# Point at the Yelp Phoenix business file and build the two category
# representations that BusinessETL provides for it.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print('Data pre-processing done')

# Evaluation runs for the available algorithms, currently disabled:
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')
my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One bucket per distinct label. Rows labelled -1 (presumably DBSCAN's noise
# marker -- confirm against Clusterer) are collected in the last bucket; every
# other label is used directly as the bucket index.
size = len(set(my_labels))
clusters = [[] for _ in range(size)]

for index, label in enumerate(my_labels):
    bucket = size - 1 if label == -1 else int(label)
    clusters[bucket].append(
        binary_to_categories(my_matrix[index], my_categories))

# Disabled experiments on slices of the matrix:
# Clusterer.linkage(my_matrix[:3000])
# Clusterer.gaac(my_matrix[:500][:50])

sets = []
Example #2
0
    def clustering(file_path):
        """Vectorize the tips stored at ``file_path`` and cluster them.

        The vectors come from ``TipTfidf.tf_idf_tips`` (presumably TF-IDF
        weights, judging by the helper's name -- confirm) and are handed to
        ``Clusterer.cluster_and_evaluate_data`` with the 'k-means-scikit'
        algorithm. Nothing is returned.
        """
        tip_vectors = TipTfidf.tf_idf_tips(file_path)
        Clusterer.cluster_and_evaluate_data(tip_vectors, 'k-means-scikit')
Example #3
0



# Point at the Yelp Phoenix business file and build the two category
# representations that BusinessETL provides for it.
data_folder = '../../../../../../datasets/yelp_phoenix_academic_dataset/'
business_file_path = data_folder + 'yelp_academic_dataset_business.json'
my_matrix = BusinessETL.create_category_matrix(business_file_path)
my_sets = BusinessETL.create_category_sets(business_file_path)
print('Data pre-processing done')

# Evaluation runs for the available algorithms, currently disabled:
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-scikit')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'k-means-nltk')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'mean-shift')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'ward')
# Clusterer.cluster_and_evaluate_data(my_matrix, 'dbscan')
my_labels = Clusterer.cluster_data(my_matrix, 'dbscan')
my_categories = get_categories(business_file_path)

# One bucket per distinct label. Rows labelled -1 (presumably DBSCAN's noise
# marker -- confirm against Clusterer) are collected in the last bucket; every
# other label is used directly as the bucket index.
size = len(set(my_labels))
clusters = [[] for _ in range(size)]

for index, label in enumerate(my_labels):
    bucket = size - 1 if label == -1 else int(label)
    clusters[bucket].append(
        binary_to_categories(my_matrix[index], my_categories))

# Disabled experiments on slices of the matrix:
# Clusterer.linkage(my_matrix[:3000])
# Clusterer.gaac(my_matrix[:500][:50])

# Per-cluster category counting, currently disabled:
# counts = count_categories(clusters)
Example #4
0
    def clustering(file_path):
        """Vectorize the tips stored at ``file_path`` and cluster them.

        The vectors come from ``TipTfidf.tf_idf_tips`` (presumably TF-IDF
        weights, judging by the helper's name -- confirm) and are handed to
        ``Clusterer.cluster_and_evaluate_data`` with the 'k-means-scikit'
        algorithm. Nothing is returned.
        """
        tip_vectors = TipTfidf.tf_idf_tips(file_path)
        Clusterer.cluster_and_evaluate_data(tip_vectors, 'k-means-scikit')