def main():
    root_folder = raw_input("Input the root_folder name:")

    """
    Firstly, extract all .csv input file names from {root_folder}/input/*.csv
    """
    # filenames = ["8514019.csv", "9116943.csv", "9267118.csv", "9443140.csv", "9383986.csv", "9343340.csv", "9417464.csv", "9664225.csv", "9538440.csv", "9327138.csv"]
    # filenames = ["9664225.csv"]
    # filenames = ["8514019.csv"]
    filenames = []
    for input_filename in os.listdir("{root_folder}/input/".format(root_folder = root_folder)):
        if (input_filename.find(".csv") != -1):
            filenames.append(input_filename)

    """
    Get min distance between vessels
    """
    need_compute_mindistance = raw_input("Need to compute min_distance_matrix for vessel interaction? (y/n) :") == 'y'
    if (need_compute_mindistance):
        """Sort the aggregateData with MMSI based on TS"""
        data_with_mmsi = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi.csv")
        data_with_mmsi_sorted = compute_mindistance.sortDataBasedOnTS(data_with_mmsi)
        writeToCSV.writeDataToCSVWithMMSI(data_with_mmsi_sorted, root_folder + "/cleanedData", "aggregateData_with_mmsi_sorted")

        """Apply the computing of min distance using a timed window"""
        data_with_mmsi_sorted = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi_sorted.csv")
        mmsi_set = compute_mindistance.getSetOfMMSI(data_with_mmsi_sorted)
        print "mmsi_set:", list(mmsi_set)

        start_time = time.time()
        mmsi_list_dict, min_distance_matrix, vessel_distance_speed_dict = \
            compute_mindistance.computeVesselMinDistanceMatrix(data_with_mmsi_sorted, TIME_WINDOW = 1800)
        writeToCSV.saveData([{ \
            'mmsi_list_dict': mmsi_list_dict, \
            'min_distance_matrix': min_distance_matrix, \
            'vessel_distance_speed_dict': vessel_distance_speed_dict}], \
            filename = root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec")
        print "time spent:", time.time() - start_time

        """From already computed"""
        # min_distance_matrix_result = writeToCSV.loadData( \
        #     root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec.npz")
        # print "min_distance_matrix_result type:\n", type(min_distance_matrix_result)
        # mmsi_list_dict = min_distance_matrix_result[0]["mmsi_list_dict"]
        # min_distance_matrix = min_distance_matrix_result[0]["min_distance_matrix"]
        # vessel_distance_speed_dict = min_distance_matrix_result[0]["vessel_distance_speed_dict"]
        # print "min_distance_matrix loaded:\n", min_distance_matrix
        # min_of_min_distance = sys.maxint
        # for i in range(0, min_distance_matrix.shape[0]):
        #     for j in range(i + 1, min_distance_matrix.shape[1]):
        #         if (min_distance_matrix[i][j] < min_of_min_distance):
        #             min_of_min_distance = min_distance_matrix[i][j]
        # print "min_distance_matrix min of 10 tankers:", min_of_min_distance

        """Write min distance records for Agent Based Simulator"""
        writeToCSV.writeVesselSpeedToDistance( \
            path = utils.queryPath(root_folder + "/LearningResult"), \
            file_name = "vessel_speed_to_distance", \
            vessel_distance_speed_dict = vessel_distance_speed_dict)
        writeToCSV.writeVesselMinDistanceMatrix( \
            path = utils.queryPath(root_folder + "/LearningResult"), \
            file_name = "vessel_min_distance_matrix", \
            mmsi_list_dict = mmsi_list_dict, \
            min_distance_matrix = min_distance_matrix)
        writeToCSV.writeMMSIs( \
            path = utils.queryPath(root_folder + "/LearningResult"), \
            file_name = "mmsi_list", \
            mmsi_list = [key for key, index in mmsi_list_dict.iteritems()])

    """
    Test Clustering
    """
    # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_with_1D_data_refined.npz")
    # # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_cleaned.npz")
    # # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_9664225.npz")
    # print "trajectories_to_cluster.shape: ", trajectories_to_cluster.shape
    # print "type(trajectories_to_cluster): ", type(trajectories_to_cluster)
    # print "len(trajectories_to_cluster): ", len(trajectories_to_cluster)
    # # convert Lat, Lon to XY for clustering
    # all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, trajectories_to_cluster)
    # executeClustering(root_folder = root_folder, \
    #     all_OD_trajectories_XY = all_OD_trajectories_XY, \
    #     reference_lat = utils.CENTER_LAT_SG, \
    #     reference_lon = utils.CENTER_LON_SG, \
    #     filenames = filenames)
    # raise ValueError("purpose stop for testing clustering")

    """
    Plot out the value space of the features (speed, acceleration, etc.) for the aggregateData
    """
    # filename = "aggregateData.npz"
    # path = "tankers/cleanedData"
    # data = writeToCSV.loadArray("{p}/{f}".format(p = path, f = filename))
    # for trajectory in trajectories_to_cluster:
    #     plotter.plotFeatureSpace(trajectory)
    # raise ValueError("For plotting feature space only")

    """
    Read the cleaned .csv input files from {root_folder}/cleanedData/ and extract endpoints
    """
    endpoints = None
    all_OD_trajectories = []
    utils.queryPath("{root_folder}/endpoints".format(root_folder = root_folder))
    utils.queryPath("{root_folder}/trajectories".format(root_folder = root_folder))

    for i in range(0, len(filenames)):
        this_vessel_trajectory_points = writeToCSV.readDataFromCSV(root_folder + "/cleanedData", filenames[i])
        # Extract end points, along with MMSI
        this_vessel_endpoints = np.asarray(extractEndPoints(writeToCSV.readDataFromCSVWithMMSI(root_folder + "/cleanedData", filenames[i])))
        # Save end points, along with MMSI
        writeToCSV.writeDataToCSVWithMMSI( \
            this_vessel_endpoints, \
            root_folder + "/endpoints", \
            "{filename}_endpoints".format(filename = filenames[i][:filenames[i].find(".")]))
        print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape

        # Append to the total end points
        if (endpoints is None):
            endpoints = this_vessel_endpoints
        else:
            endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis = 0)

        for s in range(0, len(this_vessel_endpoints) - 1):
            originLatitude = this_vessel_endpoints[s][utils.dataDict["latitude"]]
            originLongtitude = this_vessel_endpoints[s][utils.dataDict["longitude"]]
            origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]
            endLatitude = this_vessel_endpoints[s + 1][utils.dataDict["latitude"]]
            endLongtitude = this_vessel_endpoints[s + 1][utils.dataDict["longitude"]]
            end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]

            """Extract the trajectory between this pair of OD endpoints"""
            print "\n\nextracting trajectory between endpoints ", s, " and ", s + 1
            OD_trajectories, OD_trajectories_lat_lon = extractTrajectoriesUntilOD( \
                this_vessel_trajectory_points, \
                origin_ts, \
                originLatitude, \
                originLongtitude, \
                end_ts, \
                endLatitude, \
                endLongtitude, \
                show = False, save = True, clean = False, \
                fname = filenames[i][:filenames[i].find(".")] + \
                    "_trajectory_between_endpoint{s}_and{e}".format(s = s, e = s + 1))
            # there will be one trajectory between each OD
            assert (len(OD_trajectories) > 0), "OD_trajectories extracted must have length > 0"
            print "number of trajectory points extracted : ", len(OD_trajectories[0])

            if (len(OD_trajectories[0]) > 2):  # more than just the origin and destination endpoints along the trajectory
                writeToCSV.writeDataToCSV( \
                    data = OD_trajectories_lat_lon[0], \
                    path = root_folder + "/trajectories", \
                    file_name = "{filename}_trajectory_endpoint_{s}_to_{e}".format( \
                        filename = filenames[i][:filenames[i].find(".")], s = s, e = s + 1))

                """
                Interpolation based on pure geographical trajectory, ignoring temporal information
                """
                interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(OD_trajectories)
                plotter.plotListOfTrajectories( \
                    interpolated_OD_trajectories, \
                    show = False, \
                    clean = True, \
                    save = True, \
                    fname = filenames[i][:filenames[i].find(".")] + \
                        "_interpolated_algo_3_between_endpoint{s}_and{e}".format(s = s, e = s + 1))

                """
                Interpolation of 1D data: speed, rate_of_turn, etc;
                interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
                """
                if (len(interpolated_OD_trajectories) > 0):
                    interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures( \
                        interpolated_OD_trajectories[0], \
                        OD_trajectories[0])

                # change X, Y coordinates back to Lat, Lon
                interpolated_OD_trajectories_lat_lon = convertListOfTrajectoriesToLatLon( \
                    originLatitude, originLongtitude, interpolated_OD_trajectories)
                if (len(interpolated_OD_trajectories_lat_lon) > 0):
                    # there should be only one trajectory between each pair of OD
                    all_OD_trajectories.append(interpolated_OD_trajectories_lat_lon[0])
            else:
                print "no trajectories extracted between endpoints ", s, " and ", s + 1
                plt.clf()

    assert (endpoints is not None), "Error!: No endpoints extracted from the historical data of vessels " + "_".join(filenames)
    print "Final endpoints.shape:", endpoints.shape
    print "number of interpolated all_OD_trajectories:", len(all_OD_trajectories)

    """
    Save the augmented trajectories between endpoints as a .npz data file, together with the plot
    """
    # remove error trajectories that are too far away from Singapore
    all_OD_trajectories = utils.removeErrorTrajectoryFromList(all_OD_trajectories)
    writeToCSV.saveData(all_OD_trajectories, root_folder + "/all_OD_trajectories_with_1D_data")

    # convert Lat, Lon to XY for displaying
    all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories)
    plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = False, clean = True, save = True, \
        fname = "{root_folder}_all_OD_trajectories".format(root_folder = root_folder))

    """
    Execute Clustering
    """
    executeClustering(root_folder = root_folder, \
        all_OD_trajectories_XY = all_OD_trajectories_XY, \
        reference_lat = utils.CENTER_LAT_SG, \
        reference_lon = utils.CENTER_LON_SG, \
        filenames = filenames)
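

# convertListOfTrajectoriesToXY / convertListOfTrajectoriesToLatLon and utils.LatLonToXY are
# imported from elsewhere in this project and are not shown in this file. For readers, below is a
# minimal sketch of the projection they are assumed to perform: an equirectangular approximation
# around a reference point, returning offsets in km. This is an illustrative assumption, not the
# project's actual implementation; the Sketch suffix marks it as hypothetical.
import math


def latLonToXYSketch(reference_lat, reference_lon, lat, lon, earth_radius_km = 6371.0):
    """Hypothetical sketch: project (lat, lon) to planar (x, y) km offsets from the reference point."""
    # scale longitude differences by cos(reference latitude) so east-west km stay comparable near the reference
    x = math.radians(lon - reference_lon) * earth_radius_km * math.cos(math.radians(reference_lat))
    y = math.radians(lat - reference_lat) * earth_radius_km
    return x, y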
def main():
    metric_to_use = int(raw_input("use metric?\n" + "1. l2\n" + "2. center of mass\n"))
    root_folder = "tankers/out_sample_test"

    """Read centroids"""
    centroids = None
    if (metric_to_use == 1):
        centroids = writeToCSV.loadData("tankers/cleanedData/centroids_arr_l2.npz")
    elif (metric_to_use == 2):
        centroids = writeToCSV.loadData("tankers/cleanedData/centroids_arr_center_mass.npz")

    """Extract endpoints, trajectories, augmentation"""
    filenames = [
        "9050462.csv",
        "9259769.csv",
        "9327138.csv",
        "9408475.csv",
        "9417464.csv",
        "9548440.csv"]  # for out sample test
    # filenames = ["9408475.csv"]
    endpoints = None
    all_OD_trajectories = []

    """Do the augmentation if not yet done"""
    if (not os.path.exists(root_folder + "/all_OD_trajectories_with_1D_data.npz")):
        for i in range(0, len(filenames)):
            this_vessel_trajectory_points = writeToCSV.readDataFromCSV(root_folder + "/cleanedData", filenames[i])
            # Extract end points, along with MMSI
            this_vessel_endpoints = np.asarray(trajectory_modeller.extractEndPoints( \
                writeToCSV.readDataFromCSVWithMMSI(root_folder + "/cleanedData", filenames[i])))
            # Save end points, along with MMSI
            writeToCSV.writeDataToCSVWithMMSI( \
                this_vessel_endpoints, \
                utils.queryPath(root_folder + "/endpoints"), \
                "{filename}_endpoints".format(filename = filenames[i][:filenames[i].find(".")]))
            print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape

            # Append to the total end points
            if (endpoints is None):
                endpoints = this_vessel_endpoints
            else:
                endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis = 0)

            for s in range(0, len(this_vessel_endpoints) - 1):
                originLatitude = this_vessel_endpoints[s][utils.dataDict["latitude"]]
                originLongtitude = this_vessel_endpoints[s][utils.dataDict["longitude"]]
                origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]
                endLatitude = this_vessel_endpoints[s + 1][utils.dataDict["latitude"]]
                endLongtitude = this_vessel_endpoints[s + 1][utils.dataDict["longitude"]]
                end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]

                """Extract the trajectory between this pair of OD endpoints"""
                print "\n\nextracting trajectory between endpoints ", s, " and ", s + 1
                OD_trajectories, OD_trajectories_lat_lon = trajectory_modeller.extractTrajectoriesUntilOD( \
                    this_vessel_trajectory_points, \
                    origin_ts, \
                    originLatitude, \
                    originLongtitude, \
                    end_ts, \
                    endLatitude, \
                    endLongtitude, \
                    show = False, save = True, clean = False, \
                    fname = filenames[i][:filenames[i].find(".")] + \
                        "_trajectory_between_endpoint{s}_and{e}".format(s = s, e = s + 1), \
                    path = utils.queryPath(root_folder + "/plots"))
                # there will be one trajectory between each OD
                assert (len(OD_trajectories) > 0), "OD_trajectories extracted must have length > 0"
                print "number of trajectory points extracted : ", len(OD_trajectories[0])

                if (len(OD_trajectories[0]) > 2):  # more than just the origin and destination endpoints along the trajectory
                    writeToCSV.writeDataToCSV( \
                        data = OD_trajectories_lat_lon[0], \
                        path = utils.queryPath(root_folder + "/trajectories"), \
                        file_name = "{filename}_trajectory_endpoint_{s}_to_{e}".format( \
                            filename = filenames[i][:filenames[i].find(".")], s = s, e = s + 1))

                    """
                    Interpolation based on pure geographical trajectory, ignoring temporal information
                    """
                    interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(OD_trajectories)
                    plotter.plotListOfTrajectories( \
                        interpolated_OD_trajectories, \
                        show = False, \
                        clean = True, \
                        save = True, \
                        fname = filenames[i][:filenames[i].find(".")] + \
                            "_interpolated_algo_3_between_endpoint{s}_and{e}".format(s = s, e = s + 1), \
                        path = utils.queryPath(root_folder + "/plots"))

                    """
                    Interpolation of 1D data: speed, rate_of_turn, etc;
                    interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
                    """
                    if (len(interpolated_OD_trajectories) > 0):
                        interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures( \
                            interpolated_OD_trajectories[0], \
                            OD_trajectories[0])

                    # change X, Y coordinates back to Lat, Lon
                    interpolated_OD_trajectories_lat_lon = trajectory_modeller.convertListOfTrajectoriesToLatLon( \
                        originLatitude, originLongtitude, interpolated_OD_trajectories)
                    if (len(interpolated_OD_trajectories_lat_lon) > 0):
                        # there should be only one trajectory between each pair of OD
                        all_OD_trajectories.append(interpolated_OD_trajectories_lat_lon[0])
                else:
                    print "no trajectories extracted between endpoints ", s, " and ", s + 1
                    plt.clf()

        assert (endpoints is not None), "Error!: No endpoints extracted from the historical data of vessels " + "_".join(filenames)
        print "Final endpoints.shape:", endpoints.shape
        print "number of interpolated all_OD_trajectories:", len(all_OD_trajectories)

        all_OD_trajectories = utils.removeErrorTrajectoryFromList(all_OD_trajectories)
        writeToCSV.saveData(all_OD_trajectories, root_folder + "/all_OD_trajectories_with_1D_data")
    else:
        all_OD_trajectories = writeToCSV.loadData(root_folder + "/all_OD_trajectories_with_1D_data.npz")

    """Convert Lat, Lon to XY for displaying"""
    all_OD_trajectories_XY = trajectory_modeller.convertListOfTrajectoriesToXY( \
        utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories)
    plotter.plotListOfTrajectories( \
        all_OD_trajectories_XY, \
        show = True, \
        clean = True, \
        save = False, \
        fname = "out_sample_tanker_all_OD_trajectories", \
        path = utils.queryPath(root_folder + "/plots"))

    """Test distance to cluster centroids"""
    centroids_XY = trajectory_modeller.convertListOfTrajectoriesToXY( \
        utils.CENTER_LAT_SG, utils.CENTER_LON_SG, centroids)
    for i in range(0, len(all_OD_trajectories_XY)):
        this_tr_XY = all_OD_trajectories_XY[i]
        if (metric_to_use == 1):
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids( \
                this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityL2)
            print "augmented trajectories[{i}]".format(i = i), \
                "'s best l2 distance is against cluster centroids[{i}], = ".format(i = according_pattern_index), \
                this_tr_centroids_dist, ", max allowed distance = ", 1000
        elif (metric_to_use == 2):
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids( \
                this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityCenterMass)
            print "augmented trajectories[{i}]".format(i = i), \
                "'s best center of mass distance is against cluster centroids[{i}], = ".format(i = according_pattern_index), \
                this_tr_centroids_dist, ", max allowed distance = ", 1.5
        # plotter.plotFeatureSpace(centroids[according_pattern_index])
        # plotter.plotFeatureSpace( \
        #     trajectory_modeller.convertListOfTrajectoriesToLatLon(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, [this_tr_XY])[0])
    return
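

# minDistanceAgainstCentroids is called by main() above but defined elsewhere in the project. As a
# reference, here is a minimal sketch of what it is assumed to do: score one trajectory against every
# cluster centroid with the supplied dissimilarity function and return the smallest distance together
# with the index of the matching centroid. The Sketch suffix marks this as a hypothetical reconstruction.
def minDistanceAgainstCentroidsSketch(trajectory_XY, centroids_XY, metric_func):
    """Hypothetical sketch: return (min_distance, index of the closest centroid)."""
    assert (len(centroids_XY) > 0), "need at least one centroid to compare against"
    # evaluate the dissimilarity of this trajectory against each centroid,
    # e.g. with metric_func = clustering_worker.trajectoryDissimilarityL2
    distances = [metric_func(trajectory_XY, centroid) for centroid in centroids_XY]
    best_index = min(range(len(distances)), key = lambda k: distances[k])
    return distances[best_index], best_index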
def executeClustering(root_folder, all_OD_trajectories_XY, reference_lat, reference_lon, filenames):
    fname = "{root_folder}_dissimilarity_l2_cophenetic_distance".format(root_folder = root_folder)
    # fname = "10_tankers_dissimilarity_l2_inconsistent_refined_endpoints"
    # fname = "10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints"
    # fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints"
    # fname = "10_tankers_dissimilarity_l2_inconsistent"
    # fname = "10_tankers_dissimilarity_l2_all_K"
    # fname = "10_tankers_dissimilarity_center_mass"
    # fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_cleaned"
    # fname = "10_tankers_dissimilarity_center_mass_inconsistent_cleaned"
    opt_cluster_label, cluster_labels, CH_indexes = clustering_worker.clusterTrajectories( \
        trajectories = all_OD_trajectories_XY, \
        fname = fname, \
        path = utils.queryPath("tankers/cluster_result/{folder}".format(folder = fname)), \
        metric_func = clustering_worker.trajectoryDissimilarityL2,
        # metric_func = clustering_worker.trajectoryDissimilarityCenterMass,
        # user_distance_matrix = writeToCSV.loadData(root_folder + \
        #     "/cluster_result/10_tankers_dissimilarity_center_mass/10_tankers_dissimilarity_center_mass_cleaned.npz"),
        # user_distance_matrix = writeToCSV.loadData(root_folder + \
        #     "/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned.npz"),
        # user_distance_matrix = writeToCSV.loadData(root_folder + \
        #     "/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints" + \
        #     "/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints.npz"),
        # user_distance_matrix = writeToCSV.loadData(root_folder + \
        #     "/cluster_result/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints" + \
        #     "/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints.npz"),
        criterion = 'distance')
    print "opt_cluster_label:", opt_cluster_label
    print "opt_num_cluster:", len(set(opt_cluster_label))
    # print "distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityL2( \
    #     all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
    # print "distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityL2( \
    #     all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
    # print "center of mass measure distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityCenterMass( \
    #     all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
    # print "center of mass measure distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityCenterMass( \
    #     all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
    # print "matrix:\n", clustering_worker.getTrajectoryDistanceMatrix( \
    #     all_OD_trajectories_XY, \
    #     metric_func = clustering_worker.trajectoryDissimilarityL2)
    # plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = True, clean = True, save = False, fname = "")

    """Construct the endpoints to representative trajectory mapping"""
    endpoints = None
    for filename in filenames:
        this_vessel_endpoints = writeToCSV.readDataFromCSVWithMMSI( \
            root_folder + "/endpoints", \
            "{filename}_endpoints.csv".format(filename = filename[:filename.find(".")]))
        # Append to the total end points
        if (endpoints is None):
            endpoints = this_vessel_endpoints
        else:
            endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis = 0)

    cluster_centroids = clustering_worker.getClusterCentroids(opt_cluster_label, all_OD_trajectories_XY)
    cluster_centroids_lat_lon = {}  # {cluster_label : centroid} dictionary
    for cluster_label, centroid in cluster_centroids.iteritems():
        cluster_centroids_lat_lon[cluster_label] = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
            [copy.deepcopy(centroid)])[0]
        # writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[cluster_label]), root_folder + "/cleanedData/DEBUGGING", \
        #     "refined_centroid_{i}".format(i = cluster_label))

    # flatten
    cluster_centroids_lat_lon_flattened = [point for cluster_label, centroid in cluster_centroids_lat_lon.iteritems() \
        for point in centroid]
    writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon_flattened), root_folder + "/cleanedData", \
        "centroids_" + fname)

    """Array of centroids written to .npz"""
    writeToCSV.saveData([centroid for cluster_label, centroid in cluster_centroids_lat_lon.iteritems()], \
        root_folder + "/cleanedData/centroids_arr")
    # raise ValueError("purpose stop for clustering only")

    """DEBUGGING, using unrefined data"""
    # point_to_examine = (1.2625833, 103.6827)
    # point_to_examine_XY = utils.LatLonToXY(reference_lat, reference_lon, point_to_examine[0], point_to_examine[1])
    # augmented_trajectories_from_point_to_examine_index = []
    # augmented_trajectories_from_point_to_examine = []
    # for i in range(0, len(all_OD_trajectories_XY)):
    #     trajectory = all_OD_trajectories_XY[i]
    #     if (np.linalg.norm([ \
    #         point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
    #         point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
    #         augmented_trajectories_from_point_to_examine_index.append(i)
    #         augmented_trajectories_from_point_to_examine.append(trajectory)
    #         print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index, \
    #             "starting pos:", trajectory[0][utils.data_dict_x_y_coordinate["x"]], trajectory[0][utils.data_dict_x_y_coordinate["y"]]
    # print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index
    # augmented_trajectories_from_point_to_examine = convertListOfTrajectoriesToLatLon( \
    #     reference_lat, reference_lon, copy.deepcopy(augmented_trajectories_from_point_to_examine))
    # for t in range(0, len(augmented_trajectories_from_point_to_examine)):
    #     writeToCSV.writeDataToCSV(np.asarray(augmented_trajectories_from_point_to_examine[t]), root_folder + "/cleanedData/DEBUGGING", \
    #         "DEBUGGING_augmented_{t}".format(t = augmented_trajectories_from_point_to_examine_index[t]))
    # augmented_trajectories_from_point_to_examine_clusters = []
    # for i in augmented_trajectories_from_point_to_examine_index:
    #     augmented_trajectories_from_point_to_examine_clusters.append(opt_cluster_label[i])
    # augmented_trajectories_from_point_to_examine_clusters_unique = list(set(augmented_trajectories_from_point_to_examine_clusters))
    # class_trajectories_dict = clustering_worker.formClassTrajectoriesDict(opt_cluster_label, all_OD_trajectories_XY)
    # for i in augmented_trajectories_from_point_to_examine_clusters_unique:
    #     writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[i]), root_folder + "/cleanedData/DEBUGGING", \
    #         "DEBUGGING_centroid_{i}".format(i = i))
    #     print "cluster_centroids[{i}], starting point:".format(i = i), cluster_centroids[i][0]
    #     """save all trajectories under this cluster i"""
    #     class_trajectories = class_trajectories_dict[i]
    #     class_trajectories_lat_lon = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, copy.deepcopy(class_trajectories))
    #     for j in range(0, len(class_trajectories_lat_lon)):
    #         print "class_trajectories[{i}], starting point:".format(i = i), class_trajectories[j][0]
    #         writeToCSV.writeDataToCSV(np.asarray(class_trajectories_lat_lon[j]), \
    #             utils.queryPath(root_folder + "/cleanedData/DEBUGGING/CLASS{i}".format(i = i)), \
    #             "DEBUGGING_class_{i}_trajectory_{j}".format(i = i, j = j))
    """END DEBUGGING"""

    endpoints_cluster_dict = endPointsToRepresentativeTrajectoryMapping( \
        endpoints, \
        all_OD_trajectories_XY, \
        opt_cluster_label, \
        reference_lat, \
        reference_lon)

    empty_endpoints = []
    augmented_index_to_extra_label_mapping = {}  # mapping from normal index to appended index in all_protocol_trajectories
    cluster_label_to_cluster_size = {}  # 'cluster size' of the appended augmented trajectory in all_protocol_trajectories
    all_protocol_trajectories = []  # indexed by cluster label (offset by 1: cluster 1 -> all_protocol_trajectories[0])
    for label in range(np.min(opt_cluster_label), np.max(opt_cluster_label) + 1):
        assert (label in cluster_centroids_lat_lon), \
            "{label} is supposed to be in the cluster_centroids_lat_lon dict".format(label = label)
        all_protocol_trajectories.append(cluster_centroids_lat_lon[label])
        cluster_label_to_cluster_size[label - 1] = len(np.where(opt_cluster_label == label)[0])
    assert (np.sum([size for label, size in cluster_label_to_cluster_size.iteritems()]) == len(opt_cluster_label)), \
        "sum of individual label size should == total count"

    """
    Assign augmented trajectories to empty endpoints: True/False
    """
    assign_augmented_to_empty_enpoints_flag = False
    DEBUG_APPEND_INDEXS = []
    if (assign_augmented_to_empty_enpoints_flag):
        for endpoint_str, endpoint_tuple_list in endpoints_cluster_dict.iteritems():
            # get the list of cluster_labels of centroids assigned to a certain endpoint
            endpoint_starting_clusters = [item.cluster for item in endpoint_tuple_list]
            if (len(endpoint_starting_clusters) == 0):
                """If no centroid assigned, then assign the original augmented trajectory"""
                this_empty_endpoint = lookForEndPoints(endpoints, endpoint_str)  # endpoints is in lat, lon
                if (this_empty_endpoint is None):
                    raise ValueError("Error! should always be able to map back endpoints, but {p} is not found".format(p = endpoint_str))
                empty_endpoints.append(this_empty_endpoint)

                point_to_examine_XY = utils.LatLonToXY(reference_lat, reference_lon, \
                    this_empty_endpoint[utils.dataDict["latitude"]], this_empty_endpoint[utils.dataDict["longitude"]])
                augmented_trajectories_from_point_to_examine_index = []
                augmented_trajectories_from_point_to_examine = []
                for i in range(0, len(all_OD_trajectories_XY)):
                    trajectory = all_OD_trajectories_XY[i]
                    if (np.linalg.norm([ \
                        point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
                        point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
                        augmented_trajectories_from_point_to_examine_index.append(i)
                        augmented_trajectories_from_point_to_examine.append(trajectory)
                        # print "this found augmented_trajectories_from_point_to_examine_index:", \
                        #     augmented_trajectories_from_point_to_examine_index, \
                        #     "starting pos:", \
                        #     trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
                        #     trajectory[0][utils.data_dict_x_y_coordinate["y"]]
                print "all indexes (w.r.t all_OD_trajectories_XY) for this_empty_endpoint:", augmented_trajectories_from_point_to_examine_index
                DEBUG_APPEND_INDEXS.append(augmented_trajectories_from_point_to_examine_index)

                """Append augmented_trajectories_from_point_to_examine to the end of the array of centroids and give it an extra label"""
                for augmented_index in augmented_trajectories_from_point_to_examine_index:
                    if (not augmented_index in augmented_index_to_extra_label_mapping):
                        # if this normal trajectory is not yet appended, append it and mark it in augmented_index_to_extra_label_mapping
                        augmented_index_to_extra_label_mapping[augmented_index] = len(all_protocol_trajectories)
                        cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] = 1
                        all_protocol_trajectories.append( \
                            convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
                                [copy.deepcopy(all_OD_trajectories_XY[augmented_index])])[0])
                    else:
                        cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] += 1
                    endpoints_cluster_dict[endpoint_str].append(utils.ClusterCentroidTuple( \
                        cluster = augmented_index_to_extra_label_mapping[augmented_index], \
                        centroid = all_protocol_trajectories[augmented_index_to_extra_label_mapping[augmented_index]]))

    """Asserting and saving of info for Agent Based Simulator"""
    assert (len(set([index for index_list in DEBUG_APPEND_INDEXS for index in index_list])) == \
        len(all_protocol_trajectories) - len(set(opt_cluster_label))), \
        "size of appended augmented trajectories should == len(DEBUG_APPEND_INDEXS)"
    for index in range(0, len(all_protocol_trajectories)):
        assert (index in cluster_label_to_cluster_size), "all_protocol_trajectories's index mapping to cluster should be complete"
    for label, size in cluster_label_to_cluster_size.iteritems():
        print "label, size:", label, size
    print "number of endpoints that do not have clusters assigned to:", len(empty_endpoints)
    print "total number of endpoints:", len(endpoints)

    writeToCSV.writeDataToCSVWithMMSI(np.asarray(endpoints), root_folder + "/endpoints", "all_endpoints_with_MMSI")
    writeToCSV.writeDataToCSV(np.asarray(empty_endpoints), root_folder + "/cleanedData", \
        "non_starting_endpoints_{root_folder}_dissimilarity_l2_cophenetic_distance_cleaned".format(root_folder = root_folder))
    writeToCSV.saveData([endpoints_cluster_dict], \
        filename = root_folder + "/cleanedData" + "/endpoints_cluster_dict" + fname)

    """Write all the all_protocol_trajectories for DEBUGGING purpose"""
    for i in range(0, len(all_protocol_trajectories)):
        protocol_trajectory = all_protocol_trajectories[i]
        writeToCSV.writeDataToCSV( \
            np.asarray(protocol_trajectory), \
            utils.queryPath(root_folder + "/cleanedData/DEBUGGING/ALL_PROTOCOLS_PATTERN_ONLY"), \
            "all_protocol_{i}".format(i = i))

    """Save related csv files for Agent Based Simulator"""
    writeToCSV.writeAllProtocolTrajectories( \
        path = utils.queryPath(root_folder + "/LearningResult"), \
        file_name = "protocol_trajectories_with_cluster_size", \
        all_protocol_trajectories = all_protocol_trajectories, \
        cluster_label_to_cluster_size = cluster_label_to_cluster_size)
    writeToCSV.writeEndPointsToProtocolTrajectoriesIndexesWithMMSI( \
        path = utils.queryPath(root_folder + "/LearningResult"), \
        file_name = "endpoints_to_protocol_trajectories", \
        endpoints = endpoints, \
        endpoints_cluster_dict = endpoints_cluster_dict)