def get_categories(file_path):
    """
    Returns the field names left on the first business record once the
    'categories' list column has been transposed and the unwanted fields
    have been dropped. These are presumably the category names, one per
    column of the category matrix -- confirm against
    ETLUtils.add_transpose_list_column.

    :param file_path: the path for the file that contains the businesses data
    :return: the keys of the first processed business record
    """
    businesses = ETLUtils.load_json_file(file_path)

    # Transpose the 'categories' list of every business into columns
    businesses = ETLUtils.add_transpose_list_column('categories', businesses)
    # The return value is ignored, so this is expected to modify the
    # records in place
    BusinessETL.drop_unwanted_fields(businesses)

    # Only the first record is inspected; an empty data set raises IndexError
    first_business = businesses[0]
    return first_business.keys()
def create_category_matrix(file_path):
    """
    Creates a matrix with all the categories for businesses that are
    contained in the Yelp Phoenix Business data set. Each column of the
    matrix represents a category, and each row a business. This is a binary
    matrix that contains a 1 at the position i,j if the business i contains
    the category j, and a 0 otherwise.

    NOTE(review): the column order is the iteration order of each record's
    values, so it is assumed that every processed record holds the same keys
    in the same order -- confirm against ETLUtils.add_transpose_list_column.

    :rtype: numpy.ndarray
    :param file_path: the path for the file that contains the businesses data
    :return: a numpy array binary matrix
    """
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    # The return value is ignored, so this is expected to modify the
    # records in place
    BusinessETL.drop_unwanted_fields(records)

    # dict.values() is a view object on Python 3; numpy.array() would wrap
    # the view itself into a 0-d object array instead of a numeric row, so
    # the values are materialised into a list first (a no-op change on
    # Python 2, where values() already returns a list).
    matrix = numpy.array(
        [numpy.array(list(record.values())) for record in records])

    return matrix