def test_formatting_yelp_nyc_gives_correct_genuine_label():
    """Metadata built with ``label=1`` must format to a review whose label equals False."""
    parsed = review_pb2.Review()
    entry = _get_entry_text()
    metadata = _get_nyc_metadata(label=1)

    format_yelp_nyc_review(parsed, entry, metadata)

    # Equality (not identity/truthiness) is intentional: it is the exact
    # comparison this test has always performed on the label field.
    assert parsed.label == False
def _data_path(data_directory, suffix):
    """Return the normalized path formed by appending ``suffix`` to ``data_directory``."""
    # Plain concatenation rather than os.path.join: every suffix starts with
    # '/', and os.path.join would discard data_directory for such a component.
    return os.path.normpath(data_directory + suffix)


def _collect_paired_reviews(content_path, metadata_path, formatter,
                            *formatter_args):
    """Build a ReviewSet from a content file and its line-aligned metadata file.

    For every line of ``content_path`` a new review is added to the set and
    ``formatter`` is called with that review, the content line, the next line
    of ``metadata_path`` and any extra ``formatter_args``.
    """
    review_set = review_set_pb2.ReviewSet()
    with open(content_path, 'r') as content_file, \
            open(metadata_path, 'r') as metadata_file:
        for line in content_file:
            # readline() (rather than zip) keeps the original pairing: if the
            # metadata file is shorter, the formatter receives '' instead of
            # the remaining reviews being dropped silently.
            formatter(review_set.reviews.add(), line,
                      metadata_file.readline(), *formatter_args)
    return review_set


def _parse_amazon_review(line):
    """Parse one line of the Amazon review file into a Python object."""
    try:
        # Remove only a trailing comma after the object. The previous blanket
        # replace('},', '}') also rewrote any '},' occurring inside the
        # review text itself, silently altering the data.
        return json.loads(line.rstrip().rstrip(','))
    except ValueError:
        # Fall back to the historical behaviour so that every line that used
        # to load still loads.
        return json.loads(line.replace('},', '}'))


def _write_serialized(review_set, path):
    """Write the bytes returned by ``review_set.SerializeToString()`` to ``path``."""
    with open(path, 'wb') as f:
        f.write(review_set.SerializeToString())


def _write_text(review_set, path):
    """Write ``str(review_set)`` to ``path``."""
    with open(path, 'w') as f:
        f.write(str(review_set))


def protify_data(data_directory):
    """Convert the raw review datasets under ``data_directory`` into ReviewSets.

    Reads the YelpNYC, YelpZip, YelpCHI (hotels and restaurants) and
    amazonBooks inputs and writes one file per dataset into
    ``<data_directory>/normalizedData``.

    Args:
        data_directory: Directory containing the ``YelpData``, ``amazonBooks``
            and ``normalizedData`` sub-directories.

    NOTE(review): the NYC and Zip outputs are written as serialized bytes while
    the Chicago and Amazon outputs are written as ``str(review_set)``. This
    difference is preserved as-is; confirm that it is intended.
    """
    # NYC
    review_set = _collect_paired_reviews(
        _data_path(data_directory, '/YelpData/YelpNYC/reviewContent'),
        _data_path(data_directory, '/YelpData/YelpNYC/metadata'),
        format_yelp_nyc_review)
    _write_serialized(
        review_set, _data_path(data_directory, '/normalizedData/yelpNYC'))

    # Zip (formatted with the NYC formatter, exactly as before)
    review_set = _collect_paired_reviews(
        _data_path(data_directory, '/YelpData/YelpZip/reviewContent'),
        _data_path(data_directory, '/YelpData/YelpZip/metadata'),
        format_yelp_nyc_review)
    _write_serialized(
        review_set, _data_path(data_directory, '/normalizedData/yelpZip'))

    # Chicago: each dataset gets its own fresh pair of ID map services.
    chicago_datasets = (
        ('/YelpData/YelpCHI/output_review_yelpHotelData_NRYRcleaned.txt',
         '/YelpData/YelpCHI/output_meta_yelpHotelData_NRYRcleaned.txt',
         '/normalizedData/yelpCHI-hotels'),
        ('/YelpData/YelpCHI/output_review_yelpResData_NRYRcleaned.txt',
         '/YelpData/YelpCHI/output_meta_yelpResData_NRYRcleaned.txt',
         '/normalizedData/yelpCHI-restaurants'),
    )
    for content_suffix, metadata_suffix, output_suffix in chicago_datasets:
        userid_map_service = IDMapService(id_func)
        productid_map_service = IDMapService(id_func)
        review_set = _collect_paired_reviews(
            _data_path(data_directory, content_suffix),
            _data_path(data_directory, metadata_suffix),
            format_yelp_chi_review,
            userid_map_service, productid_map_service)
        _write_text(review_set, _data_path(data_directory, output_suffix))

    # Amazon
    review_set = review_set_pb2.ReviewSet()
    userid_map_service = IDMapService(id_func)
    productid_map_service = IDMapService(id_func)
    with open(_data_path(data_directory, '/amazonBooks/reviewContent'),
              'r') as f:
        for line in f:
            # Parse before adding a review so that a malformed line never
            # leaves an empty review behind in the set.
            review_obj = _parse_amazon_review(line)
            format_amazonBooks_review(review_set.reviews.add(), review_obj,
                                      userid_map_service,
                                      productid_map_service)
    _write_text(
        review_set, _data_path(data_directory, '/normalizedData/amazonBooks'))
def test_formatting_yelp_nyc_gives_correct_review_content():
    """An entry built with content "Blarg" must format to that ``review_content``."""
    parsed = review_pb2.Review()

    format_yelp_nyc_review(
        parsed,
        _get_entry_text(content="Blarg"),
        _get_nyc_metadata(),
    )

    assert parsed.review_content == "Blarg"
def test_formatting_yelp_nyc_gives_correct_rating():
    """Metadata built with ``rating=1.0`` must format to a review rated 1."""
    parsed = review_pb2.Review()
    entry_text = _get_entry_text()
    metadata = _get_nyc_metadata(rating=1.0)

    format_yelp_nyc_review(parsed, entry_text, metadata)

    assert parsed.rating == 1
def test_formatting_yelp_nyc_gives_correct_date():
    """An entry built with date "2001-02-03" must format to that ``date``."""
    parsed = review_pb2.Review()

    format_yelp_nyc_review(
        parsed,
        _get_entry_text(date="2001-02-03"),
        _get_nyc_metadata(),
    )

    assert parsed.date == "2001-02-03"
def test_formatting_yelp_nyc_gives_correct_product_id():
    """An entry built with ``productid=3`` must format to ``product_id == 3``."""
    parsed = review_pb2.Review()

    format_yelp_nyc_review(
        parsed,
        _get_entry_text(productid=3),
        _get_nyc_metadata(),
    )

    assert parsed.product_id == 3
def test_formatting_yelp_nyc_gives_correct_user_id():
    """An entry built with ``userid=2`` must format to ``user_id == 2``."""
    parsed = review_pb2.Review()

    format_yelp_nyc_review(
        parsed,
        _get_entry_text(userid=2),
        _get_nyc_metadata(),
    )

    assert parsed.user_id == 2