def get_categories(file_path):
    """Return the keys of the first business record after category expansion.

    The records are loaded from the JSON file at ``file_path``, passed
    through ``ETLUtils.add_transpose_list_column`` for the 'categories'
    column and then cleaned with ``BusinessETL.drop_unwanted_fields``.

    NOTE(review): the remaining keys are presumably the category names --
    confirm against ``BusinessETL.drop_unwanted_fields``.

    :param file_path: path of the JSON file holding the business records
    :return: the keys of the first record
    :raises IndexError: if no records are left to inspect
    """
    businesses = ETLUtils.add_transpose_list_column(
        'categories', ETLUtils.load_json_file(file_path))

    # The return value is ignored, so the helper is expected to modify the
    # records in place.
    BusinessETL.drop_unwanted_fields(businesses)

    first_business = businesses[0]
    return first_business.keys()
Esempio n. 2
0
def get_categories(file_path):
    """Return the keys of the first business record once categories are expanded.

    Steps: load the JSON records found at ``file_path``, expand the
    'categories' column with ``ETLUtils.add_transpose_list_column`` and
    strip the fields rejected by ``BusinessETL.drop_unwanted_fields``.

    NOTE(review): what is left is presumably one key per category --
    verify against ``BusinessETL.drop_unwanted_fields``.

    :param file_path: path of the JSON file with the business records
    :return: the keys of the first record
    :raises IndexError: if there is no first record
    """
    raw_businesses = ETLUtils.load_json_file(file_path)
    expanded = ETLUtils.add_transpose_list_column('categories', raw_businesses)

    # Called only for its effect on ``expanded``; its result is discarded.
    BusinessETL.drop_unwanted_fields(expanded)

    head = expanded[0]
    return head.keys()
    for i in xrange(num_categories):
        if binary_list[i] == 1:
            category_list.append(categories[i])

    # print category_list

    return category_list






# Driver script: build the category structures for the businesses, cluster
# the category matrix with the 'dbscan' method and prepare one empty bucket
# per distinct cluster label.

# Relative location of the Yelp Phoenix academic dataset; it is resolved
# against the directory the script is launched from.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'
# Both structures are built from the same business file.
# NOTE(review): presumably a business-by-category matrix and the category
# sets of each business -- confirm against BusinessETL.
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print 'Data pre-processing done'

# Alternative clustering methods, kept for reference but disabled.
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')
my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One empty list per distinct value found in my_labels.
# NOTE(review): if the labels are not exactly 0..size-1 (e.g. a negative
# "noise" label), indexing clusters by label would misplace items -- confirm.
size = len(set(my_labels))
clusters = [[] for i in range(size)]

for i in xrange(len(my_labels)):
Esempio n. 4
0
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)

        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        """Return a new list holding ``records`` ordered by ``field``.

        The input is left untouched; the ordering is stable, so records
        with equal ``field`` values keep their relative order.

        :param records: iterable of records supporting ``record[field]``
        :param field: key whose value is used to compare the records
        :param reverse: when true, order from highest to lowest value
        :return: a list with the records in sorted order
        """
        ordered = list(records)
        ordered.sort(key=itemgetter(field), reverse=reverse)
        return ordered



# Script: select the reviews that belong to the businesses obtained for
# the 'Hotels' category, write them to a separate JSON file and report the
# elapsed time.
start = time.time()

# NOTE(review): review_etl is not used in the visible part of this script;
# every ReviewETL call below goes through the class itself.
review_etl = ReviewETL()
# Absolute, machine-specific locations of the Yelp training set files.
my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
# Business ids selected with the 'Hotels' argument.
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
# NOTE: despite the "restaurant" naming, these variables hold the reviews
# filtered with the 'Hotels' business ids, and the output file name ends in
# "_review_hotels.json".
my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids)
my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))


# main()
# Elapsed time between the two time.time() readings, in seconds.
end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
Esempio n. 5
0
    return category_list


def count_categories(cluster_list):
    """Count how often each item occurs inside every cluster.

    Each cluster is an iterable of iterables (for example a list holding
    the category list of each member); its inner iterables are flattened
    and the occurrences of every item are tallied.

    :param cluster_list: iterable of clusters
    :return: list with one ``Counter`` per cluster, in the input order;
        an empty cluster yields an empty ``Counter``
    """
    # chain.from_iterable flattens lazily, so no intermediate list is built
    # and the cluster is not unpacked into positional arguments.
    return [
        Counter(itertools.chain.from_iterable(cluster))
        for cluster in cluster_list
    ]





# Driver script: build the category structures for the businesses, cluster
# the category matrix with the 'dbscan' method and prepare one empty bucket
# per distinct cluster label.

# Relative location of the Yelp Phoenix academic dataset; it is resolved
# against the directory the script is launched from.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'
# Both structures are built from the same business file.
# NOTE(review): presumably a business-by-category matrix and the category
# sets of each business -- confirm against BusinessETL.
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print 'Data pre-processing done'

# Alternative clustering methods, kept for reference but disabled.
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')
my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One empty list per distinct value found in my_labels.
# NOTE(review): if the labels are not exactly 0..size-1 (e.g. a negative
# "noise" label), indexing clusters by label would misplace items -- confirm.
size = len(set(my_labels))
clusters = [[] for i in range(size)]

for i in xrange(len(my_labels)):