def each_cluster(locations, users):
    """Cluster the locations of every first-level cluster a second time.

    Locations are grouped by their ``cluster1`` attribute.  For each group a
    TF-IDF matrix is built from the tags of the posts of its locations, the
    transposed matrix is handed to ``cfuzzy.cmeans`` together with
    ``CLUSTER_NUM_2``, and the resulting membership is drawn with
    ``output_on_map`` into ``./data/Summary/map_cluster3_<cluster>.html``.

    Args:
        locations: Mapping whose values are location objects exposing
            ``cluster1``, ``posts`` (each post exposing ``tags``), ``lat``,
            ``lng`` and ``lname``.
        users: Not used by this function; kept so existing callers keep
            working.

    Returns:
        None. The only outputs are the progress prints and the map files.
    """
    def first_level_cluster(location):
        return location.cluster1

    ordered = sorted(locations.values(), key=first_level_cluster)

    # groupby only merges adjacent items, hence the sort above.
    members_by_cluster = {}
    for cluster_id, members in itertools.groupby(ordered, first_level_cluster):
        members_by_cluster[cluster_id] = list(members)

    for cluster_id, members in members_by_cluster.items():
        print("In layer 2 - cluster:", cluster_id, ", #:", len(members))

        # One document per location: every tag of every post, space-separated.
        documents = [
            " ".join(" ".join(post.tags) for post in location.posts)
            for location in members
        ]

        tfidf_matrix, _tag_names = clda.get_tfidf(documents)

        # Only the last element (the membership) is consumed; the other seven
        # are unpacked so a change in the return arity still fails loudly.
        (_cntr, _u, _u0, _d, _jm, _p, _fpc, membership) = cfuzzy.cmeans(
            tfidf_matrix.T, CLUSTER_NUM_2
        )

        # NOTE: storing the result on the locations is currently disabled:
        # set_location_cluster(members, membership, "cluster2")
        markers = [
            (float(location.lat), float(location.lng), location.lname)
            for location in members
        ]
        map_path = "./data/Summary/map_cluster3_" + str(cluster_id) + ".html"
        output_on_map(markers, membership, CLUSTER_NUM_2, map_path)
print("--------------------------------------")

# An optional first command-line argument overrides the cluster count.
if len(sys.argv) > 1:
    CLUSTER_NUM = int(sys.argv[1])

# Load locations and user posts, then attach the users to the locations.
locations = clocation.get_locations_list()
users = cuser.get_users_posts_afile(USER_POSTS_FILE)
fit_users_to_location(locations, users)

# Coordinate features: one (lat, lng) pair per location, in mapping order.
coordinate = numpy.array(
    [(float(loc.lat), float(loc.lng)) for loc in locations.values()]
)
# print("coordinate.shape:", coordinate.shape)

# Tag features: TF-IDF over the corpus built from the same locations.
corpus = get_corpus(locations.values())
tfidf, tags_name = clda.get_tfidf(corpus)
print("END getting data:", datetime.datetime.now())

# filter_tag receives the transposed matrix; its result replaces both names.
tfidf, tags_name = filter_tag(tfidf.T, tags_name)
# print("tfidf:", tfidf.shape)

# Cluster on coordinates and tags together; only cluster_membership is used
# below, the remaining results stay available as module-level names.
(cntr1, cntr2, u, u0, d1, d2, d, jm, p, fpc,
 cluster_membership) = cfuzzy.cmeans_comb(
    coordinate.T, tfidf, CLUSTER_NUM, WEIGHT, ERROR
)

output_on_map(
    [(float(loc.lat), float(loc.lng), loc.lname) for loc in locations.values()],
    cluster_membership,
    CLUSTER_NUM,
    OUTPUT_MAP,
)

# The membership is indexed in the same iteration order used to build
# `coordinate`, so position i belongs to the i-th key.
for i, key in enumerate(locations.keys()):
    locations[key].cluster1 = cluster_membership[i]

output_location_cluster(locations, OUTPUT_LOCATION_CLUSTER)
# set_location_cluster(locations, membership, "cluster1")
print("STARTTIME:", datetime.datetime.now())
print("--------------------------------------")

# An optional first command-line argument overrides the cluster count.
if len(sys.argv) > 1:
    CLUSTER_NUM = int(sys.argv[1])

# Load locations and user posts, then attach the users to the locations.
locations = clocation.get_locations_list()
users = cuser.get_users_posts_afile(USER_POSTS_FILE)
fit_users_to_location(locations, users)

# Coordinate features: one (lat, lng) pair per location, in mapping order.
coordinate = numpy.array(
    [(float(loc.lat), float(loc.lng)) for loc in locations.values()]
)
# print("coordinate.shape:", coordinate.shape)

# Tag features: TF-IDF over the corpus built from the same locations.
corpus = get_corpus(locations.values())
tfidf, tags_name = clda.get_tfidf(corpus)
print("END getting data:", datetime.datetime.now())

# filter_tag receives the transposed matrix; its result replaces both names.
tfidf, tags_name = filter_tag(tfidf.T, tags_name)
# print("tfidf:", tfidf.shape)

# Cluster on coordinates and tags together; only cluster_membership is used
# below, the remaining results stay available as module-level names.
(cntr1, cntr2, u, u0, d1, d2, d, jm, p, fpc,
 cluster_membership) = cfuzzy.cmeans_comb(
    coordinate.T, tfidf, CLUSTER_NUM, WEIGHT, ERROR
)

output_on_map(
    [(float(loc.lat), float(loc.lng), loc.lname) for loc in locations.values()],
    cluster_membership,
    CLUSTER_NUM,
    OUTPUT_MAP,
)

# The membership is indexed in the same iteration order used to build
# `coordinate`, so position i belongs to the i-th key.
for i, key in enumerate(locations.keys()):
    locations[key].cluster1 = cluster_membership[i]

output_location_cluster(locations, OUTPUT_LOCATION_CLUSTER)
# set_location_cluster(locations, membership, "cluster1")

# The second-layer clustering is currently disabled:
# each_cluster(locations, users)