def main():
	root_folder = raw_input("Input the root_folder name:")
	
	"""
	Firstly, extract all .csv input file names from {root_folder}/input/*.csv
	"""
	# filenames = ["8514019.csv", "9116943.csv", "9267118.csv", "9443140.csv", "9383986.csv", "9343340.csv", "9417464.csv", "9664225.csv", "9538440.csv", "9327138.csv"]
	# filenames = ["9664225.csv"]
	# filenames = ["8514019.csv"]
	filenames = []
	for input_filename in os.listdir("{root_folder}/input/".format(root_folder = root_folder)):
		if (input_filename.find(".csv") != -1):
			filenames.append(input_filename)

	"""
	Get min distance between vessels
	"""
	need_compute_mindistance = raw_input("Need to compute min_distance_matrix for vessel interaction? (y/n) :") == 'y'
	if (need_compute_mindistance):
		"""sort the aggregateData with MMSI based on TS"""
		data_with_mmsi = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi.csv")
		data_with_mmsi_sorted = compute_mindistance.sortDataBasedOnTS(data_with_mmsi)
		writeToCSV.writeDataToCSVWithMMSI(data_with_mmsi_sorted, root_folder + "/cleanedData", "aggregateData_with_mmsi_sorted")

		"""Apply the computing of min distance using a timed window"""
		data_with_mmsi_sorted = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi_sorted.csv")
		mmsi_set = compute_mindistance.getSetOfMMSI(data_with_mmsi_sorted)
		print mmsi_set
		print list(mmsi_set)

		start_time = time.time()
		mmsi_list_dict, min_distance_matrix, vessel_distance_speed_dict = \
		compute_mindistance.computeVesselMinDistanceMatrix(data_with_mmsi_sorted, TIME_WINDOW = 1800)

		writeToCSV.saveData([{ \
			'mmsi_list_dict': mmsi_list_dict, \
			'min_distance_matrix': min_distance_matrix, \
			'vessel_distance_speed_dict': vessel_distance_speed_dict
			}], filename = root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec")

		print "time spent:", time.time() - start_time

		"""From already computed"""	
		# min_distance_matrix_result = writeToCSV.loadData(\
		# 	root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec.npz")
		# print "min_distance_matrix_result type:\n", type(min_distance_matrix_result)
		# mmsi_list_dict = min_distance_matrix_result[0]["mmsi_list_dict"]
		# min_distance_matrix = min_distance_matrix_result[0]["min_distance_matrix"]
		# vessel_distance_speed_dict = min_distance_matrix_result[0]["vessel_distance_speed_dict"]
		# print "min_distance_matrix loaded:\n", min_distance_matrix
		# min_of_min_distance = sys.maxint
		# for i in range(0, min_distance_matrix.shape[0]):
		# 	for j in range(i + 1, min_distance_matrix.shape[1]):
		# 		if (min_distance_matrix[i][j] < min_of_min_distance):
		# 			min_of_min_distance = min_distance_matrix[i][j]
		# print "min_distance_matrix min of 10 tankers:", min_of_min_distance

		"""write min distance records for Agent Based Simulator"""
		writeToCSV.writeVesselSpeedToDistance(\
			path = utils.queryPath(root_folder+"LearningResult"),\
			file_name = "vessel_speed_to_distance", \
			vessel_distance_speed_dict = vessel_distance_speed_dict)
		writeToCSV.writeVesselMinDistanceMatrix(\
			path = utils.queryPath(root_folder+"LearningResult"), \
			file_name = "vessel_min_distance_matrix", \
			mmsi_list_dict = mmsi_list_dict, \
			min_distance_matrix = min_distance_matrix)
		writeToCSV.writeMMSIs(\
			path = utils.queryPath(root_folder+"LearningResult"), \
			file_name = "mmsi_list", \
			mmsi_list = [key for key, index in mmsi_list_dict.iteritems()])

	"""
	Test Clustering
	"""
	# trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_with_1D_data_refined.npz")
	# # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_cleaned.npz")
	# # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_9664225.npz")
	# print "trajectories_to_cluster.shape: ", trajectories_to_cluster.shape
	# print "type(trajectories_to_cluster): ", type(trajectories_to_cluster)
	# print "len(trajectories_to_cluster): ", len(trajectories_to_cluster)
	
	# # convert Lat, Lon to XY for clustering
	# all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, trajectories_to_cluster)
	# executeClustering(root_folder = root_folder, \
	# 	all_OD_trajectories_XY = all_OD_trajectories_XY, \
	# 	reference_lat = utils.CENTER_LAT_SG, \
	# 	reference_lon = utils.CENTER_LON_SG, \
	#   filenames = filenames)
	# raise ValueError("purpose stop for testing clustering")


	"""
	plot out the value space of the features, speed, accelerations, etc, for the aggregateData
	"""
	# filename = "aggregateData.npz"
	# path = "tankers/cleanedData"
	# data = writeToCSV.loadArray("{p}/{f}".format(p = path, f=filename))
	# for trajectory in trajectories_to_cluster:
		# plotter.plotFeatureSpace(trajectory)
	# raise ValueError("For plotting feature space only")

	"""
	Read the cleaned .csv input files form {root_folder}/cleanedData/
	Extract endpoints
	"""
	endpoints = None
	all_OD_trajectories = []
	utils.queryPath("{root_folder}/endpoints".format(root_folder = root_folder))
	utils.queryPath("{root_folder}/trajectories".format(root_folder = root_folder))
	
	for i in range(0, len(filenames)):
		this_vessel_trajectory_points = writeToCSV.readDataFromCSV(root_folder + "/cleanedData", filenames[i])
		# Extract end points, along with MMSI
		this_vessel_endpoints = np.asarray(extractEndPoints(writeToCSV.readDataFromCSVWithMMSI(root_folder + "/cleanedData", filenames[i])))
		# Save end points, along with MMSI
		writeToCSV.writeDataToCSVWithMMSI( \
			this_vessel_endpoints, \
			root_folder + "/endpoints", \
			"{filename}_endpoints".format(filename = filenames[i][:filenames[i].find(".")]))
		print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape

		# Append to the total end points
		if(endpoints is None):
			endpoints = this_vessel_endpoints
		else:
			endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis=0)

		for s in range (0, len(this_vessel_endpoints) - 1):
			originLatitude = this_vessel_endpoints[s][utils.dataDict["latitude"]]
			originLongtitude = this_vessel_endpoints[s][utils.dataDict["longitude"]]
			origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]

			endLatitude = this_vessel_endpoints[s + 1][utils.dataDict["latitude"]]
			endLongtitude = this_vessel_endpoints[s + 1][utils.dataDict["longitude"]]	
			end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]
			
			"""Extracting trajectory between a pair of OD"""
			print "\n\nextracting endpoints between ", s, " and ", s + 1
			OD_trajectories, OD_trajectories_lat_lon = extractTrajectoriesUntilOD(\
				this_vessel_trajectory_points, \
				origin_ts, \
				originLatitude, \
				originLongtitude, \
				end_ts, \
				endLatitude, \
				endLongtitude, \
				show = False, save = True, clean = False, \
				fname = filenames[i][:filenames[i].find(".")] + "_trajectory_between_endpoint{s}_and{e}".format(s = s, e = s + 1))
				# there will be one trajectory between each OD		
			assert (len(OD_trajectories) > 0), "OD_trajectories extracted must have length > 0"
			print "number of trajectory points extracted : ", len(OD_trajectories[0])

			if(len(OD_trajectories[0]) > 2): # more than just the origin and destination endpoints along the trajectory
				writeToCSV.writeDataToCSV( \
					data = OD_trajectories_lat_lon[0],
					path = root_folder + "/trajectories", \
					file_name = "{filename}_trajectory_endpoint_{s}_to_{e}".format(filename = filenames[i][:filenames[i].find(".")], \
						s = s, \
						e = s + 1))
				"""
				Interpolation based on pure geographical trajectory, ignore temporal information
				"""
				interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(OD_trajectories)
				plotter.plotListOfTrajectories( \
					interpolated_OD_trajectories, \
					show = False, \
					clean = True, \
					save = True, \
					fname = filenames[i][:filenames[i].find(".")] + "_interpolated_algo_3_between_endpoint{s}_and{e}".format(\
						s = s, \
						e = s + 1))
				
				"""
				Interpolation of 1D data: speed, rate_of_turn, etc; interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
				"""
				if(len(interpolated_OD_trajectories) > 0):
					interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures( \
						interpolated_OD_trajectories[0], \
						OD_trajectories[0])

				# change X, Y coordinate to Lat, Lon
				interpolated_OD_trajectories_lat_lon = convertListOfTrajectoriesToLatLon( \
					originLatitude, originLongtitude, interpolated_OD_trajectories)
				if(len(interpolated_OD_trajectories_lat_lon) > 0):
					# since there should be only one trajectory between each pair of OD
					all_OD_trajectories.append(interpolated_OD_trajectories_lat_lon[0])
			else:
				print "no trajectories extracted between endpoints ", s , " and ", s + 1
				plt.clf()

	assert (not endpoints is None), "Error!: No endpoints extracted from the historial data of vessels" + "_".join(filenames)
	print "Final endpoints.shape:", endpoints.shape
	print "number of interpolated all_OD_trajectories:", len(all_OD_trajectories)

	"""
	save the augmented trajectories between endpoints as npz data file and the plot
	"""
	# remove error trajectories that are too far from Singapore
	all_OD_trajectories = utils.removeErrorTrajectoryFromList(all_OD_trajectories)
	writeToCSV.saveData(all_OD_trajectories, root_folder + "/all_OD_trajectories_with_1D_data")
	# convert Lat, Lon to XY for displaying
	all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories)
	plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = False, clean = True, save = True, \
		fname = "{root_folder}_all_OD_trajectories".format(root_folder = root_folder))


	"""
	Execute Clustering
	"""
	executeClustering(root_folder = root_folder, \
		all_OD_trajectories_XY = all_OD_trajectories_XY, \
		reference_lat = utils.CENTER_LAT_SG, \
		reference_lon = utils.CENTER_LON_SG, \
		filenames = filenames)
# Example no. 2
# 0
def main():
    metric_to_use = int(
        raw_input("use metric?\n" + "1. l2\n" + "2. center of mass\n"))
    root_folder = "tankers/out_sample_test"
    """read centroids"""
    centroids = None
    if (metric_to_use == 1):
        centroids = writeToCSV.loadData(
            "tankers/cleanedData/centroids_arr_l2.npz")
    elif (metric_to_use == 2):
        centroids = writeToCSV.loadData(
            "tankers/cleanedData/centroids_arr_center_mass.npz")
    """Extract endpoints, trajectories, augmentation"""
    filenames = [
        "9050462.csv", "9259769.csv", "9327138.csv", "9408475.csv",
        "9417464.csv", "9548440.csv"
    ]  # for out sample test
    # filenames = ["9408475.csv"]
    endpoints = None
    all_OD_trajectories = []
    """Do the augmentation if not yet done"""
    if (not os.path.exists(root_folder +
                           "/all_OD_trajectories_with_1D_data.npz")):
        for i in range(0, len(filenames)):
            this_vessel_trajectory_points = writeToCSV.readDataFromCSV(
                root_folder + "/cleanedData", filenames[i])
            # Extract end points, along with MMSI
            this_vessel_endpoints = np.asarray(
                trajectory_modeller.extractEndPoints(
                    writeToCSV.readDataFromCSVWithMMSI(
                        root_folder + "/cleanedData", filenames[i])))
            # Save end points, along with MMSI
            writeToCSV.writeDataToCSVWithMMSI( \
             this_vessel_endpoints, \
             utils.queryPath(root_folder + "/endpoints"), \
             "{filename}_endpoints".format(filename = filenames[i][:filenames[i].find(".")]))
            print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape

            # Append to the total end points
            if (endpoints is None):
                endpoints = this_vessel_endpoints
            else:
                endpoints = np.concatenate((endpoints, this_vessel_endpoints),
                                           axis=0)

            for s in range(0, len(this_vessel_endpoints) - 1):
                originLatitude = this_vessel_endpoints[s][
                    utils.dataDict["latitude"]]
                originLongtitude = this_vessel_endpoints[s][
                    utils.dataDict["longitude"]]
                origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]

                endLatitude = this_vessel_endpoints[s + 1][
                    utils.dataDict["latitude"]]
                endLongtitude = this_vessel_endpoints[s + 1][
                    utils.dataDict["longitude"]]
                end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]
                """Extracting trajectory between a pair of OD"""
                print "\n\nextracting endpoints between ", s, " and ", s + 1
                OD_trajectories, OD_trajectories_lat_lon = trajectory_modeller.extractTrajectoriesUntilOD(\
                 this_vessel_trajectory_points, \
                 origin_ts, \
                 originLatitude, \
                 originLongtitude, \
                 end_ts, \
                 endLatitude, \
                 endLongtitude, \
                 show = False, save = True, clean = False, \
                 fname = filenames[i][:filenames[i].find(".")] + "_trajectory_between_endpoint{s}_and{e}".format(s = s, e = s + 1), \
                 path = utils.queryPath(root_folder + "/plots"))
                # there will be one trajectory between each OD
                assert (len(OD_trajectories) >
                        0), "OD_trajectories extracted must have length > 0"
                print "number of trajectory points extracted : ", len(
                    OD_trajectories[0])

                if (
                        len(OD_trajectories[0]) > 2
                ):  # more than just the origin and destination endpoints along the trajectory
                    writeToCSV.writeDataToCSV( \
                     data = OD_trajectories_lat_lon[0],
                     path = utils.queryPath(root_folder + "/trajectories"), \
                     file_name = "{filename}_trajectory_endpoint_{s}_to_{e}".format(filename = filenames[i][:filenames[i].find(".")], \
                      s = s, \
                      e = s + 1))
                    """
					Interpolation based on pure geographical trajectory, ignore temporal information
					"""
                    interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(
                        OD_trajectories)
                    plotter.plotListOfTrajectories( \
                     interpolated_OD_trajectories, \
                     show = False, \
                     clean = True, \
                     save = True, \
                     fname = filenames[i][:filenames[i].find(".")] + "_interpolated_algo_3_between_endpoint{s}_and{e}".format(\
                      s = s, \
                      e = s + 1), \
                     path = utils.queryPath(root_folder + "/plots"))
                    """
					Interpolation of 1D data: speed, rate_of_turn, etc; interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
					"""
                    if (len(interpolated_OD_trajectories) > 0):
                        interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures( \
                         interpolated_OD_trajectories[0], \
                         OD_trajectories[0])

                    # change X, Y coordinate to Lat, Lon
                    interpolated_OD_trajectories_lat_lon = trajectory_modeller.convertListOfTrajectoriesToLatLon( \
                     originLatitude, originLongtitude, interpolated_OD_trajectories)
                    if (len(interpolated_OD_trajectories_lat_lon) > 0):
                        # since there should be only one trajectory between each pair of OD
                        all_OD_trajectories.append(
                            interpolated_OD_trajectories_lat_lon[0])
                else:
                    print "no trajectories extracted between endpoints ", s, " and ", s + 1
                    plt.clf()

        assert (
            not endpoints is None
        ), "Error!: No endpoints extracted from the historial data of vessels" + "_".join(
            filenames)
        print "Final endpoints.shape:", endpoints.shape
        print "number of interpolated all_OD_trajectories:", len(
            all_OD_trajectories)
        all_OD_trajectories = utils.removeErrorTrajectoryFromList(
            all_OD_trajectories)
        writeToCSV.saveData(all_OD_trajectories,
                            root_folder + "/all_OD_trajectories_with_1D_data")
    else:
        all_OD_trajectories = writeToCSV.loadData(
            root_folder + "/all_OD_trajectories_with_1D_data.npz")
    """convert Lat, Lon to XY for displaying"""
    all_OD_trajectories_XY = trajectory_modeller.convertListOfTrajectoriesToXY(
        utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories)
    plotter.plotListOfTrajectories(\
     all_OD_trajectories_XY, \
     show = True, \
     clean = True, \
     save = False, \
     fname = "out_sample_tanker_all_OD_trajectories", path = utils.queryPath(root_folder + "/plots"))
    """Test distance to cluster centroids"""
    centroids_XY = trajectory_modeller.convertListOfTrajectoriesToXY(\
     utils.CENTER_LAT_SG, utils.CENTER_LON_SG, centroids)

    for i in range(0, len(all_OD_trajectories_XY)):
        this_tr_XY = all_OD_trajectories_XY[i]
        if (metric_to_use == 1):
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids(\
             this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityL2)
            print "augmented trajectories[{i}]".format(i = i), \
            "'s best l2 distance is against cluster centroids[{i}], = ".format(i = according_pattern_index), \
            this_tr_centroids_dist, ", max allowed distance  = ", 1000
        elif (metric_to_use == 2):
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids(\
             this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityCenterMass)
            print "augmented trajectories[{i}]".format(i = i), \
            "'s best center of mass distance is against cluster centroids[{i}], = ".format(i = according_pattern_index), \
            this_tr_centroids_dist, ", max allowed distance  = ", 1.5

        # plotter.plotFeatureSpace(centroids[according_pattern_index])
        # plotter.plotFeatureSpace(\
        # trajectory_modeller.convertListOfTrajectoriesToLatLon(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, [this_tr_XY])[0])

    return
def executeClustering(root_folder, all_OD_trajectories_XY, reference_lat, reference_lon, filenames):
	"""Cluster the given trajectories and write the results used by the Agent Based Simulator.

	Steps, in order:

	1. Cluster ``all_OD_trajectories_XY`` with
	   ``clustering_worker.clusterTrajectories`` (L2 dissimilarity,
	   ``criterion = 'distance'``).
	2. Re-read every vessel's ``{name}_endpoints.csv`` from
	   ``{root_folder}/endpoints`` and concatenate them.
	3. Compute one centroid per cluster, convert it to lat/lon and save the
	   centroids under ``{root_folder}/cleanedData`` (flattened CSV and
	   ``centroids_arr``).
	4. Map endpoints to representative trajectories with
	   ``endPointsToRepresentativeTrajectoryMapping``.
	5. Build the list of "protocol" trajectories (one centroid per cluster
	   label) with their cluster sizes, and write them plus the endpoint
	   mapping to ``cleanedData``, ``endpoints`` and ``LearningResult``.

	Args:
		root_folder: folder prefix used for every file read and written here
			(except the clustering result path, see the note in the body).
		all_OD_trajectories_XY: the trajectories to cluster; the name suggests
			X/Y coordinates relative to the reference point -- TODO confirm.
		reference_lat, reference_lon: reference point passed to the XY <-> lat/lon
			conversions.
		filenames: input file names; the part before the first "." selects each
			vessel's endpoints file.

	Returns:
		None. All results are written to files or printed.

	Raises:
		AssertionError: if one of the consistency checks on labels, centroids or
			cluster sizes fails.
		ValueError: only reachable when the (currently disabled) empty-endpoint
			assignment is enabled and an endpoint cannot be mapped back.
	"""
	fname = "{root_folder}_dissimilarity_l2_cophenetic_distance".format(root_folder = root_folder)
	# fname = "10_tankers_dissimilarity_l2_inconsistent_refined_endpoints"
	# fname = "10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints"
	# fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints"

	# fname = "10_tankers_dissimilarity_l2_inconsistent"
	# fname = "10_tankers_dissimilarity_l2_all_K"
	# fname = "10_tankers_dissimilarity_center_mass"
	# fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_cleaned"
	# fname = "10_tankers_dissimilarity_center_mass_inconsistent_cleaned"
	

	# NOTE(review): the cluster result path below starts with a hard-coded
	# "tankers/" rather than root_folder -- confirm this is intended.
	# Only opt_cluster_label is used in the rest of this function.
	opt_cluster_label , cluster_labels, CH_indexes = clustering_worker.clusterTrajectories( \
		trajectories  = all_OD_trajectories_XY, \
		fname = fname, \
		path = utils.queryPath("tankers/cluster_result/{folder}".format(folder = fname)), \
		metric_func = clustering_worker.trajectoryDissimilarityL2, \
		# metric_func = clustering_worker.trajectoryDissimilarityCenterMass, \
		
		# user_distance_matrix = writeToCSV.loadData(root_folder + \
			# "/cluster_result/10_tankers_dissimilarity_center_mass/10_tankers_dissimilarity_center_mass_cleaned.npz"), \

		# user_distance_matrix = writeToCSV.loadData(root_folder + \
			# "/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned.npz"), \
		
		# user_distance_matrix = writeToCSV.loadData(root_folder + \
		# 	"/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints" + \
		# 	"/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints.npz"), \

		# user_distance_matrix = writeToCSV.loadData(root_folder + \
			# "/cluster_result/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints" + \
			# "/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints.npz"), \

		criterion = 'distance')

	print "opt_cluster_label:", opt_cluster_label
	print "opt_num_cluster:", len(set(opt_cluster_label))


	# print "distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityL2( \
	# 	all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
	# print "distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityL2( \
	# 	all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
	# print "center of mass measure distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityCenterMass( \
	# 	all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
	# print "center of mass measure distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityCenterMass( \
	# 	all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
	# print "matrix:\n", clustering_worker.getTrajectoryDistanceMatrix(\
	# 	all_OD_trajectories_XY, \
	# 	metric_func = clustering_worker.trajectoryDissimilarityL2)
	# plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = True, clean = True, save = False, fname = "")
	
	"""Construct the endpoints to representative trajectory mapping"""
	# Endpoints are re-read from the per-vessel "{name}_endpoints.csv" files in
	# {root_folder}/endpoints, so those files must already exist.
	endpoints = None
	for filename in filenames:
		this_vessel_endpoints = writeToCSV.readDataFromCSVWithMMSI( \
		root_folder + "/endpoints", \
		"{filename}_endpoints.csv".format(filename = filename[:filename.find(".")]))

		# Append to the total end points
		if(endpoints is None):
			endpoints = this_vessel_endpoints
		else:
			endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis=0)

	# One centroid per cluster label; each is converted to lat/lon on a deep
	# copy so the XY centroid itself is left untouched.
	cluster_centroids = clustering_worker.getClusterCentroids(opt_cluster_label, all_OD_trajectories_XY)
	cluster_centroids_lat_lon = {} # [cluster_label : centroid] dictionary
	for cluster_label, centroid in cluster_centroids.iteritems():
		cluster_centroids_lat_lon[cluster_label] = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
			[copy.deepcopy(centroid)])[0]
		# writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[cluster_label]), root_folder + "/cleanedData/DEBUGGING", \
		# "refined_centroid_{i}".format(i = cluster_label))

	# flatten: all points of all centroids in one list, written as a single CSV
	cluster_centroids_lat_lon_flattened = [point for cluster_label, centroid in cluster_centroids_lat_lon.iteritems() \
	for point in centroid]
	writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon_flattened), root_folder + "/cleanedData", \
		"centroids_" + fname)

	"""array of centroids written to .npz"""
	writeToCSV.saveData([centroid for cluster_label, centroid in cluster_centroids_lat_lon.iteritems()], \
		root_folder + "/cleanedData/centroids_arr")

	# raise ValueError("purpose stop for clustering only")

	"""DEBUGGING,using unrefined data"""
	# point_to_examine = (1.2625833, 103.6827)
	# point_to_examine_XY = utils.LatLonToXY(reference_lat,reference_lon,point_to_examine[0], point_to_examine[1])
	# augmented_trajectories_from_point_to_examine_index = []
	# augmented_trajectories_from_point_to_examine = []
	# for i in range(0, len(all_OD_trajectories_XY)):
	# 	trajectory = all_OD_trajectories_XY[i]
	# 	if (np.linalg.norm([ \
	# 		point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
	# 		point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
	# 		augmented_trajectories_from_point_to_examine_index.append(i)
	# 		augmented_trajectories_from_point_to_examine.append(trajectory)
	# 		print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index, \
	# 		"starting pos:", trajectory[0][utils.data_dict_x_y_coordinate["x"]], trajectory[0][utils.data_dict_x_y_coordinate["y"]] 
	# print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index



	# augmented_trajectories_from_point_to_examine = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, copy.deepcopy(augmented_trajectories_from_point_to_examine))
	# for t in range(0, len(augmented_trajectories_from_point_to_examine)):
	# 	writeToCSV.writeDataToCSV(np.asarray(augmented_trajectories_from_point_to_examine[t]), root_folder + "/cleanedData/DEBUGGING", \
	# 	"DEBUGGING_augmented_{t}".format(t = augmented_trajectories_from_point_to_examine_index[t]))


	# augmented_trajectories_from_point_to_examine_clusters = []
	# for i in augmented_trajectories_from_point_to_examine_index:
	# 	augmented_trajectories_from_point_to_examine_clusters.append(opt_cluster_label[i])
	# augmented_trajectories_from_point_to_examine_clusters_unique = list(set(augmented_trajectories_from_point_to_examine_clusters))


	# class_trajectories_dict = clustering_worker.formClassTrajectoriesDict(opt_cluster_label, all_OD_trajectories_XY)

	# for i in augmented_trajectories_from_point_to_examine_clusters_unique:
	# 	writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[i]), root_folder + "/cleanedData/DEBUGGING", \
	# 	"DEBUGGING_centroid_{i}".format(i = i))
	# 	print "cluster_centroids[{i}], starting point:".format(i = i), cluster_centroids[i][0]

	# 	"""save all trajectories under this cluster i """
	# 	class_trajectories = class_trajectories_dict[i]
	# 	class_trajectories_lat_lon = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, copy.deepcopy(class_trajectories))
	# 	for j in range(0, len(class_trajectories_lat_lon)):
	# 		print "class_trajectories[{i}], starting point:".format(i = i), class_trajectories[j][0]
	# 		writeToCSV.writeDataToCSV(np.asarray(class_trajectories_lat_lon[j]), \
	# 			utils.queryPath(root_folder + "/cleanedData/DEBUGGING/CLASS{i}".format(i = i)) , \
	# 			"DEBUGGING_class_{i}_trajectory_{j}".format(i = i , j = j))

	"""END DEBUGGING"""


	endpoints_cluster_dict = endPointsToRepresentativeTrajectoryMapping(\
		endpoints, \
		all_OD_trajectories_XY , \
		opt_cluster_label, \
		reference_lat, \
		reference_lon)

	empty_endpoints = []
	augmented_index_to_extra_label_mapping = {} # mapping from normal index to appended index in all_protocol_trajectories
	cluster_label_to_cluster_size = {} # 'cluster size' of the appended augmented trajectory in all_protocol_trajectories
	
	all_protocol_trajectories = [] # indexed by cluster label (offset by 1, cluster 1 -> all_protocol_trajectories[0])
	# NOTE(review): the size dict is keyed by label - 1, which lines up with the
	# list index only if the smallest label is 1 -- confirm against the clusterer.
	# NOTE(review): `opt_cluster_label == label` presumably relies on
	# opt_cluster_label being a numpy array (element-wise comparison) -- confirm.
	for label in range(np.min(opt_cluster_label), np.max(opt_cluster_label) + 1):
		assert (label in cluster_centroids_lat_lon), "{label} is supposed to be in the cluster_centroids_lat_lon dict".format(label = label)
		all_protocol_trajectories.append(cluster_centroids_lat_lon[label])
		cluster_label_to_cluster_size[label - 1] = len(np.where(opt_cluster_label == label)[0])
	assert(np.sum([size for label, size in cluster_label_to_cluster_size.iteritems()]) == len(opt_cluster_label)), "sum of individual label size should == total count"

	"""
	assign augmented trajectories to empty endpoints: True/False
	"""	
	# The flag is hard-coded to False, so the whole block below is currently skipped
	# and empty_endpoints / DEBUG_APPEND_INDEXS stay empty.
	assign_augmented_to_empty_enpoints_flag = False
	DEBUG_APPEND_INDEXS = []
	if (assign_augmented_to_empty_enpoints_flag):
		for endpoint_str, endpoint_tuple_list in endpoints_cluster_dict.iteritems():
			endpoint_starting_clusters = [item.cluster for item in endpoint_tuple_list] # get the list of cluster_labels of centroids to a certain endpoint

			if (len(endpoint_starting_clusters) == 0):
				"""If no centroid assigned, then assign the original augmented trajectory"""
				this_empty_endpoint = lookForEndPoints(endpoints, endpoint_str) # endpoints is in lat, lon
				if (this_empty_endpoint is None):
					raise ValueError("Error! should always be able to map back endpoints, but {p} is not found".format(p = endpoint_str))
				empty_endpoints.append(this_empty_endpoint)

				# Collect every trajectory whose first point lies within
				# utils.NEIGHBOURHOOD_ENDPOINT (Euclidean distance in XY) of this endpoint.
				point_to_examine_XY = utils.LatLonToXY(reference_lat,reference_lon, \
					this_empty_endpoint[utils.dataDict["latitude"]], this_empty_endpoint[utils.dataDict["longitude"]])
				augmented_trajectories_from_point_to_examine_index = []
				augmented_trajectories_from_point_to_examine = []
				for i in range(0, len(all_OD_trajectories_XY)):
					trajectory = all_OD_trajectories_XY[i]
					if (np.linalg.norm([ \
						point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
						point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
						augmented_trajectories_from_point_to_examine_index.append(i)
						augmented_trajectories_from_point_to_examine.append(trajectory)
						# print "this found augmented_trajectories_from_point_to_examine_index:", \
						# augmented_trajectories_from_point_to_examine_index, \
						# "starting pos:", \
						# trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
						# trajectory[0][utils.data_dict_x_y_coordinate["y"]] 
				print "all indexes (w.r.t all_OD_trajectories_XY) for this_empty_endpoint:", augmented_trajectories_from_point_to_examine_index

				DEBUG_APPEND_INDEXS.append(augmented_trajectories_from_point_to_examine_index)

				"""Append augmented_trajectories_from_point_to_examine to end of array of centroids and give extra label"""
				for augmented_index in augmented_trajectories_from_point_to_examine_index:
					if (not augmented_index in augmented_index_to_extra_label_mapping): 
						# if this normal trajectory is not appended yet, append it and mark it in augmented_index_to_extra_label_mapping
						augmented_index_to_extra_label_mapping[augmented_index] = len(all_protocol_trajectories)
						cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] = 1
						all_protocol_trajectories.append(\
							convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
								[copy.deepcopy(all_OD_trajectories_XY[augmented_index])])[0])
					else:
						cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] += 1

					endpoints_cluster_dict[endpoint_str].append(utils.ClusterCentroidTuple(\
						cluster = augmented_index_to_extra_label_mapping[augmented_index], \
						centroid = all_protocol_trajectories[augmented_index_to_extra_label_mapping[augmented_index]]))

	"""Asserting and Saving of info for Agent Based Simulator"""
	# Number of distinct appended trajectory indexes must equal the number of
	# protocol trajectories beyond one per distinct cluster label.
	assert (len(set([index for index_list in DEBUG_APPEND_INDEXS for index in index_list])) == \
		len(all_protocol_trajectories) - len(set(opt_cluster_label))), \
	"size of appended augmented trajectories should == len(DEBUG_APPEND_INDEXS)" 

	# Every index of all_protocol_trajectories must have a size entry.
	for index in range(0, len(all_protocol_trajectories)):
		assert(index in cluster_label_to_cluster_size), "all_protocol_trajectories's index mapping to cluster should be complete"
	
	for label, size in cluster_label_to_cluster_size.iteritems():
		print "label, size:", label, size

	print "number of endpoints that do not have clusters assigned to:", len(empty_endpoints)
	print "total number of endpoints:", len(endpoints)
	writeToCSV.writeDataToCSVWithMMSI(np.asarray(endpoints), root_folder + "/endpoints", "all_endpoints_with_MMSI")
	writeToCSV.writeDataToCSV(np.asarray(empty_endpoints), root_folder + "/cleanedData", \
		"non_starting_endpoints_{root_folder}_dissimilarity_l2_cophenetic_distance_cleaned".format(root_folder = root_folder))
	writeToCSV.saveData([endpoints_cluster_dict], \
		filename = root_folder + "/cleanedData" + "/endpoints_cluster_dict" + fname)

	"""write all the all_protocol_trajectories for DEBUGGING purpose"""
	for i in range(0, len(all_protocol_trajectories)):
		protocol_trajectory = all_protocol_trajectories[i]
		writeToCSV.writeDataToCSV(\
			np.asarray(protocol_trajectory), \
			utils.queryPath(root_folder + "/cleanedData/DEBUGGING/ALL_PROTOCOLS_PATTERN_ONLY"), \
			"all_protocol_{i}".format(i = i))

	"""Save related csv files for Agent Based Simulator"""
	# NOTE(review): root_folder and "LearningResult" are joined without a "/"
	# separator -- confirm the resulting folder name is intended.
	writeToCSV.writeAllProtocolTrajectories(\
		path = utils.queryPath(root_folder+"LearningResult"), \
		file_name = "protocol_trajectories_with_cluster_size", \
		all_protocol_trajectories = all_protocol_trajectories, \
		cluster_label_to_cluster_size = cluster_label_to_cluster_size)

	writeToCSV.writeEndPointsToProtocolTrajectoriesIndexesWithMMSI(\
		path = utils.queryPath(root_folder+"LearningResult"), \
		file_name = "endpoints_to_protocol_trajectories", \
		endpoints = endpoints, \
		endpoints_cluster_dict = endpoints_cluster_dict)
def main():
    metric_to_use = int(raw_input("use metric?\n" + "1. l2\n" + "2. center of mass\n"))
    root_folder = "tankers/out_sample_test"
    """read centroids"""
    centroids = None
    if metric_to_use == 1:
        centroids = writeToCSV.loadData("tankers/cleanedData/centroids_arr_l2.npz")
    elif metric_to_use == 2:
        centroids = writeToCSV.loadData("tankers/cleanedData/centroids_arr_center_mass.npz")

    """Extract endpoints, trajectories, augmentation"""
    filenames = [
        "9050462.csv",
        "9259769.csv",
        "9327138.csv",
        "9408475.csv",
        "9417464.csv",
        "9548440.csv",
    ]  # for out sample test
    # filenames = ["9408475.csv"]
    endpoints = None
    all_OD_trajectories = []
    """Do the augmentation if not yet done"""
    if not os.path.exists(root_folder + "/all_OD_trajectories_with_1D_data.npz"):
        for i in range(0, len(filenames)):
            this_vessel_trajectory_points = writeToCSV.readDataFromCSV(root_folder + "/cleanedData", filenames[i])
            # Extract end points, along with MMSI
            this_vessel_endpoints = np.asarray(
                trajectory_modeller.extractEndPoints(
                    writeToCSV.readDataFromCSVWithMMSI(root_folder + "/cleanedData", filenames[i])
                )
            )
            # Save end points, along with MMSI
            writeToCSV.writeDataToCSVWithMMSI(
                this_vessel_endpoints,
                utils.queryPath(root_folder + "/endpoints"),
                "{filename}_endpoints".format(filename=filenames[i][: filenames[i].find(".")]),
            )
            print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape

            # Append to the total end points
            if endpoints is None:
                endpoints = this_vessel_endpoints
            else:
                endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis=0)

            for s in range(0, len(this_vessel_endpoints) - 1):
                originLatitude = this_vessel_endpoints[s][utils.dataDict["latitude"]]
                originLongtitude = this_vessel_endpoints[s][utils.dataDict["longitude"]]
                origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]

                endLatitude = this_vessel_endpoints[s + 1][utils.dataDict["latitude"]]
                endLongtitude = this_vessel_endpoints[s + 1][utils.dataDict["longitude"]]
                end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]

                """Extracting trajectory between a pair of OD"""
                print "\n\nextracting endpoints between ", s, " and ", s + 1
                OD_trajectories, OD_trajectories_lat_lon = trajectory_modeller.extractTrajectoriesUntilOD(
                    this_vessel_trajectory_points,
                    origin_ts,
                    originLatitude,
                    originLongtitude,
                    end_ts,
                    endLatitude,
                    endLongtitude,
                    show=False,
                    save=True,
                    clean=False,
                    fname=filenames[i][: filenames[i].find(".")]
                    + "_trajectory_between_endpoint{s}_and{e}".format(s=s, e=s + 1),
                    path=utils.queryPath(root_folder + "/plots"),
                )
                # there will be one trajectory between each OD
                assert len(OD_trajectories) > 0, "OD_trajectories extracted must have length > 0"
                print "number of trajectory points extracted : ", len(OD_trajectories[0])

                if (
                    len(OD_trajectories[0]) > 2
                ):  # more than just the origin and destination endpoints along the trajectory
                    writeToCSV.writeDataToCSV(
                        data=OD_trajectories_lat_lon[0],
                        path=utils.queryPath(root_folder + "/trajectories"),
                        file_name="{filename}_trajectory_endpoint_{s}_to_{e}".format(
                            filename=filenames[i][: filenames[i].find(".")], s=s, e=s + 1
                        ),
                    )
                    """
					Interpolation based on pure geographical trajectory, ignore temporal information
					"""
                    interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(OD_trajectories)
                    plotter.plotListOfTrajectories(
                        interpolated_OD_trajectories,
                        show=False,
                        clean=True,
                        save=True,
                        fname=filenames[i][: filenames[i].find(".")]
                        + "_interpolated_algo_3_between_endpoint{s}_and{e}".format(s=s, e=s + 1),
                        path=utils.queryPath(root_folder + "/plots"),
                    )

                    """
					Interpolation of 1D data: speed, rate_of_turn, etc; interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
					"""
                    if len(interpolated_OD_trajectories) > 0:
                        interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures(
                            interpolated_OD_trajectories[0], OD_trajectories[0]
                        )

                        # change X, Y coordinate to Lat, Lon
                    interpolated_OD_trajectories_lat_lon = trajectory_modeller.convertListOfTrajectoriesToLatLon(
                        originLatitude, originLongtitude, interpolated_OD_trajectories
                    )
                    if len(interpolated_OD_trajectories_lat_lon) > 0:
                        # since there should be only one trajectory between each pair of OD
                        all_OD_trajectories.append(interpolated_OD_trajectories_lat_lon[0])
                else:
                    print "no trajectories extracted between endpoints ", s, " and ", s + 1
                    plt.clf()

        assert not endpoints is None, "Error!: No endpoints extracted from the historial data of vessels" + "_".join(
            filenames
        )
        print "Final endpoints.shape:", endpoints.shape
        print "number of interpolated all_OD_trajectories:", len(all_OD_trajectories)
        all_OD_trajectories = utils.removeErrorTrajectoryFromList(all_OD_trajectories)
        writeToCSV.saveData(all_OD_trajectories, root_folder + "/all_OD_trajectories_with_1D_data")
    else:
        all_OD_trajectories = writeToCSV.loadData(root_folder + "/all_OD_trajectories_with_1D_data.npz")

    """convert Lat, Lon to XY for displaying"""
    all_OD_trajectories_XY = trajectory_modeller.convertListOfTrajectoriesToXY(
        utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories
    )
    plotter.plotListOfTrajectories(
        all_OD_trajectories_XY,
        show=True,
        clean=True,
        save=False,
        fname="out_sample_tanker_all_OD_trajectories",
        path=utils.queryPath(root_folder + "/plots"),
    )

    """Test distance to cluster centroids"""
    centroids_XY = trajectory_modeller.convertListOfTrajectoriesToXY(
        utils.CENTER_LAT_SG, utils.CENTER_LON_SG, centroids
    )

    for i in range(0, len(all_OD_trajectories_XY)):
        this_tr_XY = all_OD_trajectories_XY[i]
        if metric_to_use == 1:
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids(
                this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityL2
            )
            print "augmented trajectories[{i}]".format(
                i=i
            ), "'s best l2 distance is against cluster centroids[{i}], = ".format(
                i=according_pattern_index
            ), this_tr_centroids_dist, ", max allowed distance  = ", 1000
        elif metric_to_use == 2:
            this_tr_centroids_dist, according_pattern_index = minDistanceAgainstCentroids(
                this_tr_XY, centroids_XY, clustering_worker.trajectoryDissimilarityCenterMass
            )
            print "augmented trajectories[{i}]".format(
                i=i
            ), "'s best center of mass distance is against cluster centroids[{i}], = ".format(
                i=according_pattern_index
            ), this_tr_centroids_dist, ", max allowed distance  = ", 1.5

            # plotter.plotFeatureSpace(centroids[according_pattern_index])
            # plotter.plotFeatureSpace(\
            # trajectory_modeller.convertListOfTrajectoriesToLatLon(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, [this_tr_XY])[0])

    return