# Imports assumed by the entry points below; the project-local modules
# (ccluster, cfuzzy, clocation, cpygmaps, ctrajectory, cuser,
# locationclustering) are inferred from the calls in these excerpts.
import datetime

import numpy

import ccluster
import cfuzzy
import clocation
import cpygmaps
import ctrajectory
import cuser
import locationclustering


def main(*argv):
    start_time = datetime.datetime.now()
    print("--------------------------------------")
    print("STARTTIME:", datetime.datetime.now())
    print("--------------------------------------")

    # set parameters
    global CLUSTER_NUM
    global MAX_KTH
    global GPS_WEIGHT
    global FILTER_TIME_S
    global FILTER_TIME_E
    global OUTPUT_MAP
    global OUTPUT_PATTERN
    if len(argv) > 0:
        CLUSTER_NUM = argv[0]
        MAX_KTH = argv[1]
        GPS_WEIGHT = argv[2]
        FILTER_TIME_S = argv[3]
        FILTER_TIME_E = argv[4]
        LOCATION_TOPIC = "./data/LocationTopic/LocationTopic_OCT_c35.txt"
    else:
        LOCATION_TOPIC = "./data/LocationTopic/LocationTopic_NOVDEC_c35.txt"
    OUTPUT_MAP = OUTPUT_MAP + str(CLUSTER_NUM) + "k" + str(MAX_KTH) + "w" + str(GPS_WEIGHT)
    OUTPUT_PATTERN = OUTPUT_PATTERN + str(CLUSTER_NUM) + "k" + str(MAX_KTH)

    # Getting data
    users, locations = locationclustering.main(FILTER_TIME_S, FILTER_TIME_E)
    location_id, doc_topic = ccluster.open_doc_topic(LOCATION_TOPIC)
    locations = ccluster.fit_locations_membership(locations, numpy.transpose(doc_topic), location_id, "semantic_mem")
    print(" users # :", len(users))

    # Getting sequences of posts & locations
    #sequences = ctrajectory.split_trajectory([a_user.posts for a_user in users.values() if len(a_user.posts) != 0], SPLIT_DAY)
    sequences = ctrajectory.split_trajectory_byday([a_user.posts for a_user in users.values() if len(a_user.posts) != 0])
    sequences = ctrajectory.remove_adjacent_location(sequences)
    #sequences = get_specific(sequences)
    sequences = ctrajectory.remove_short(sequences)
    print(" remain users #:", len(set([x[0].uid for x in sequences])))

    location_sequences, longest_len = ctrajectory.convertto_location_sequences(sequences, locations)
    spatial_array = ctrajectory.get_vector_array(location_sequences, longest_len)
    semantic_array = ctrajectory.get_vector_array(location_sequences, longest_len, "semantic_mem")

    u, u0, d, jm, p, fpc, center, membership = cfuzzy.sequences_clustering_i(
        "Location", spatial_array, CLUSTER_NUM, MAX_KTH, semantic_array, GPS_WEIGHT,
        e=ERROR, algorithm="2WeightedDistance")

    #sequences, location_sequences, membership = filter_sequence(sequences, location_sequences, u, membership)
    #ouput_pattern(sequences, location_sequences, u, membership)
    #output_each_pattern(sequences, location_sequences, u, membership, 30)
    #ctrajectory.output_clusters(sequences, membership, u, OUTPUT_PATTERN)
    #output_cluster_detail(sequences, location_sequences, u, membership, 39, file=OUTPUT_ANALYSIS)

    print("--------------------------------------")
    print("ENDTIME:", datetime.datetime.now(), ", SPEND:", datetime.datetime.now() - start_time)
    print("--------------------------------------")
    return location_sequences, spatial_array, semantic_array, u
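# Hedged usage sketch (assumption, not from the original source): one way to
# drive the parameterized entry point above. The argument order mirrors the
# *argv unpacking in main(); every concrete value below is an illustrative
# placeholder, not a value taken from the project.
def example_run_weighted_clustering():
    return main(
        35,           # CLUSTER_NUM: number of fuzzy clusters (assumed)
        10,           # MAX_KTH: kth-membership cutoff (assumed)
        0.5,          # GPS_WEIGHT: spatial vs. semantic weighting (assumed)
        1412121600,   # FILTER_TIME_S: unix-time window start (assumed)
        1414800000)   # FILTER_TIME_E: unix-time window end (assumed)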
def main():
    print("--------------------------------------")
    print("STARTTIME:", datetime.datetime.now())
    print("--------------------------------------")

    # Getting data
    users, locations = locationclustering.main()
    location_id, doc_topic = ccluster.open_doc_topic(LOCATION_TOPIC)
    locations = ccluster.fit_locations_membership(locations, numpy.transpose(doc_topic), location_id, "semantic_mem")
    semantic_cluster = numpy.argmax(doc_topic, axis=1)
    locations = ccluster.fit_locations_cluster(locations, semantic_cluster, location_id, "semantic_cluster")

    # Getting sequences cluster
    sequences = ctrajectory.split_trajectory(
        [a_user.posts for a_user in users.values() if len(a_user.posts) != 0], SPLIT_DAY)
    cluster_sequences = ctrajectory.get_cluster_sequence(sequences, locations)
    semantic_sequences = ctrajectory.get_cluster_sequence(sequences, locations, "semantic_cluster")
    location_sequences = ctrajectory.convertto_location_sequences(sequences, locations)

    print("Filtering short trajectories...")
    fail_indices = []
    for i, s in enumerate(sequences):
        if len(s) <= 2:
            fail_indices.append(i)
    print(" will delete #:", len(fail_indices))
    sequences = numpy.delete(numpy.array(sequences), fail_indices)
    cluster_sequences = numpy.delete(numpy.array(cluster_sequences), fail_indices)
    semantic_sequences = numpy.delete(numpy.array(semantic_sequences), fail_indices)
    location_sequences = numpy.delete(numpy.array(location_sequences), fail_indices)
    print(" remain sequences #:", len(sequences))

    u, u0, d, jm, p, fpc, membership, distance = cfuzzy.sequences_clustering(
        "Cluster", cluster_sequences, CLUSTER_NUM, MAX_KTH, semantic_sequences,
        e=ERROR, algorithm="2Distance")

    print("Start Outputting...")
    for c in range(CLUSTER_NUM):
        this_cluster_indices = [i for i, x in enumerate(membership) if x == c]
        print(c, " >> this cluster #:", len(this_cluster_indices))
        if len(this_cluster_indices) != 0:  # was "is not 0": identity check, not a value comparison
            # kth-largest membership in this cluster acts as the cutoff
            top_10_u = sorted(u[c, this_cluster_indices], reverse=True)
            if len(top_10_u) >= MAX_KTH:
                top_10_u = top_10_u[MAX_KTH - 1]
            else:
                top_10_u = top_10_u[-1]
            top_10_indices = [i for i, x in enumerate(u[c, this_cluster_indices]) if x >= top_10_u]
            #top_10_indices = sorted(range(len(u[c, this_cluster_indices])), key=lambda x: u[c, this_cluster_indices][x], reverse=True)[0:10]
            print(" top_10:", top_10_u, ">", top_10_indices)
            print(u[c, this_cluster_indices][top_10_indices])
            points_sequences = numpy.array(location_sequences)[this_cluster_indices][top_10_indices]
            color = sorted(range(len(points_sequences)), key=lambda x: top_10_indices[x])
            cpygmaps.output_patterns_l(
                points_sequences, color, len(points_sequences), OUTPUT_MAP + "_" + str(c) + ".html")

    print("--------------------------------------")
    print("ENDTIME:", datetime.datetime.now())
    print("--------------------------------------")
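# Hedged sketch (assumption, not from the original source): the top-k
# selection performed in the output loop above, expressed with numpy.argsort.
# Assumes u is a (CLUSTER_NUM x n_sequences) fuzzy-membership matrix.
def top_k_member_indices(u, cluster, cluster_indices, k):
    # Memberships of the sequences assigned to this cluster.
    memberships = u[cluster, cluster_indices]
    # Positions of the k largest memberships, highest first; unlike the
    # cutoff-threshold version above, ties never return more than k items.
    return numpy.argsort(memberships)[::-1][:k]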
def main(*argv):
    print("--------------------------------------")
    print("STARTTIME:", datetime.datetime.now())
    print("--------------------------------------")

    # set UNIXTIME
    global FILTER_TIME_S
    global FILTER_TIME_E
    if len(argv) > 0:
        FILTER_TIME_S = argv[0]
        FILTER_TIME_E = argv[1]

    # getting data
    locations = clocation.open_locations()
    users = cuser.open_users_posts_afile(USER_POSTS_FILE)

    # preprocessing: remove unqualified users
    removes = filter_users_timeperiod(users, FILTER_TIME_S, FILTER_TIME_E)
    sequences = ctrajectory.split_trajectory_byday([a_user.posts for a_user in users.values() if len(a_user.posts) != 0])
    sequences = ctrajectory.remove_adjacent_location(sequences)
    sequences = ctrajectory.remove_short(sequences)
    removes = list(set(removes) | (set(users.keys()) - set([x[0].uid for x in sequences])))
    for key in removes:
        del users[key]
    print(" remain users #:", len(users.keys()))

    locations = clocation.fit_users_to_location(locations, users, "uid")
    set_location_user_count(locations)
    coordinate = numpy.array([(float(x.lat), float(x.lng)) for x in locations.values()])
    location_frequency = numpy.array([x.usercount for x in locations.values()])

    # clustering locations
    cntr, u, u0, d, jm, p, fpc, membership = cfuzzy.cmeans_location(
        coordinate.T, CLUSTER_NUM, MAX_KTH, location_frequency, Y,
        e=ERROR, algorithm="kthCluster_LocationFrequency")
    locations = ccluster.fit_locations_membership(locations, u, locations.keys())
    locations = ccluster.fit_locations_cluster(locations, membership, locations.keys())

    # output result
    """
    cpygmaps.output_clusters(
        [(float(x.lat), float(x.lng), str(x.cluster) + " >> " + x.lname + "(" + x.lid + ")>>u:" + str(u[x.cluster, i]))
         for i, x in enumerate(locations.values())],
        membership, CLUSTER_NUM, OUTPUT_MAP)
    output_representatives(coordinate, u, MAX_KTH)
    """
    #ccluster.output_location_cluster(locations.values(), "cluster", OUTPUT_CLUSTER)

    print("--------------------------------------")
    print("ENDTIME:", datetime.datetime.now())
    print("--------------------------------------")
    return users, locations
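# Hedged sketch (assumption): set_location_user_count is called above but not
# shown in this excerpt. A plausible implementation derives usercount (later
# read into the location_frequency array) from the per-location "uid" lists
# that fit_users_to_location is assumed to attach.
def set_location_user_count(locations):
    for a_location in locations.values():
        # Count distinct visiting users; default to an empty list when a
        # location attracted no qualified user.
        a_location.usercount = len(set(getattr(a_location, "uid", [])))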
def main():
    print("--------------------------------------")
    print("STARTTIME:", datetime.datetime.now())
    print("--------------------------------------")

    # Getting data
    users, locations = locationclustering.main()
    location_id, doc_topic = ccluster.open_doc_topic(LOCATION_TOPIC)
    locations = ccluster.fit_locations_membership(locations, numpy.transpose(doc_topic), location_id, "semantic_mem")

    # Getting sequences cluster
    sequences = ctrajectory.split_trajectory(
        [a_user.posts for a_user in users.values() if len(a_user.posts) != 0], SPLIT_DAY)
    vector_sequences = ctrajectory.get_vector_sequence(sequences, locations)
    semantic_sequences = ctrajectory.get_vector_sequence(sequences, locations, "semantic_mem")
    location_sequences = ctrajectory.convertto_location_sequences(sequences, locations)

    print("Filtering short trajectories...")
    fail_indices = []
    for i, s in enumerate(sequences):
        if len(s) <= 2:
            fail_indices.append(i)
    print(" will delete #:", len(fail_indices))
    sequences = numpy.delete(numpy.array(sequences), fail_indices)
    vector_sequences = numpy.delete(numpy.array(vector_sequences), fail_indices)
    semantic_sequences = numpy.delete(numpy.array(semantic_sequences), fail_indices)
    location_sequences = numpy.delete(numpy.array(location_sequences), fail_indices)
    print(" remain sequences #:", len(sequences))

    u, u0, d, jm, p, fpc, membership, distance = cfuzzy.sequences_clustering(
        "Location", vector_sequences, CLUSTER_NUM, MAX_KTH, semantic_sequences,
        e=ERROR, algorithm="2Distance")

    print("Start Outputting...")
    for c in range(CLUSTER_NUM):
        this_cluster_indices = [i for i, x in enumerate(membership) if x == c]
        print(c, " >> this cluster #:", len(this_cluster_indices))
        if len(this_cluster_indices) != 0:  # was "is not 0": identity check, not a value comparison
            # kth-largest membership in this cluster acts as the cutoff
            top_10_u = sorted(u[c, this_cluster_indices], reverse=True)
            if len(top_10_u) >= MAX_KTH:
                top_10_u = top_10_u[MAX_KTH - 1]
            else:
                top_10_u = top_10_u[-1]
            top_10_indices = [i for i, x in enumerate(u[c, this_cluster_indices]) if x >= top_10_u]
            #top_10_indices = sorted(range(len(u[c, this_cluster_indices])), key=lambda x: u[c, this_cluster_indices][x], reverse=True)[0:10]
            print(" top_10:", top_10_u, ">", top_10_indices)
            print(u[c, this_cluster_indices][top_10_indices])
            points_sequences = numpy.array(location_sequences)[this_cluster_indices][top_10_indices]
            color = sorted(range(len(points_sequences)), key=lambda x: top_10_indices[x])
            cpygmaps.output_patterns_l(
                points_sequences, color, len(points_sequences), OUTPUT_MAP + "_" + str(c) + ".html")

    print("--------------------------------------")
    print("ENDTIME:", datetime.datetime.now())
    print("--------------------------------------")
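# Hedged sketch (assumption, not from the original source): the short-
# trajectory filter that both sequence-clustering entry points above repeat,
# factored into one helper. It applies a single keep-mask so the parallel
# per-sequence lists stay index-aligned, instead of four numpy.delete calls.
def drop_short_sequences(min_len, sequences, *parallel_lists):
    keep = [i for i, s in enumerate(sequences) if len(s) > min_len]
    return [[seq_list[i] for i in keep] for seq_list in (sequences, *parallel_lists)]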