forked from cynsky/vessel-trajectory-modeller
trajectory_modeller.py · 728 lines (624 loc) · 35.7 KB
# Author: Xing Yifan A0105591J
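"""
Vessel trajectory modeller.

Pipeline (as wired up in main() below):
	1. Read cleaned AIS records from {root_folder}/cleanedData/.
	2. Extract endpoints (dwell points and detection-range boundaries) per vessel.
	3. Extract the trajectory between each consecutive origin-destination (OD) pair.
	4. Interpolate each OD trajectory geographically, then interpolate its 1D
	   features (speed, rate of turn, etc.) along the new points.
	5. Cluster the augmented trajectories and write centroids, endpoint-to-cluster
	   mappings and min-distance records for the Agent Based Simulator.
"""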
import numpy as np
import math
import scipy.io as sio
import scipy.sparse as sp
from scipy.sparse import csc_matrix
import csv
import matplotlib.pyplot as plt
import datetime
import time
import os
from scipy import interpolate
import scipy.spatial.distance as distance
from collections import OrderedDict
import writeToCSV
import copy
import sys
import scipy.spatial.distance as DIST
import scipy.cluster.hierarchy as HAC
import random
from sklearn import metrics
import operator
import utils
import plotter
import interpolator
import clustering_worker
import compute_mindistance
class Point(object):
def __init__(self,_x,_y):
self.x = _x
self.y = _y
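# Simple 2D point container (defined here but not otherwise referenced in this module).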
def boxMeanTrajectoryPoints(trajectory_points, reference_lat, reference_lon):
"""
trajectory_points: in lat, lon;
	returns: the mean record of these points, with latitude/longitude set to their geographical mean
"""
trajectory_points_XY = convertListOfTrajectoriesToXY(reference_lat, reference_lon, [trajectory_points])[0]
trajectory_points_XY_mean = np.mean(trajectory_points_XY, axis = 0)
lat, lon = utils.XYToLatLonGivenOrigin(reference_lat, reference_lon, \
trajectory_points_XY_mean[utils.data_dict_x_y_coordinate["x"]], trajectory_points_XY_mean[utils.data_dict_x_y_coordinate["y"]])
trajectory_points_XY_mean[utils.dataDict["latitude"]] = lat
trajectory_points_XY_mean[utils.dataDict["longitude"]] = lon
return trajectory_points_XY_mean
def extractTrajectoriesUntilOD(data, originTS, originLatitude, originLongtitude, endTS, endLatitude, endLongtitude, show = True, save = False, clean = False, fname = "", path = "plots"):
"""
returns: OD_trajectories: in x,y coordinate;
OD_trajectories_lat_lon: in lat, lon coordinate;
"""
maxSpeed = 0
for i in range(0, data.shape[0]):
speed_over_ground = data[i][utils.dataDict["speed_over_ground"]]
		if(speed_over_ground > maxSpeed and speed_over_ground != 102.3): # 102.3 knots (raw AIS value 1023) means speed not available
maxSpeed = speed_over_ground
print "This tanker maxSpeed:", maxSpeed, " knot"
OD_trajectories = [] # origin destination endpoints trajectory
i = 0
while(i< data.shape[0]):
cur_pos = data[i]
if(utils.nearOrigin( \
originLatitude, \
originLongtitude, \
cur_pos[utils.dataDict["latitude"]], \
cur_pos[utils.dataDict["longitude"]], \
thresh = 0.0) and \
cur_pos[utils.dataDict["ts"]] == originTS): # must be exact point
this_OD_trajectory = []
this_OD_trajectory.append(cur_pos)
i += 1
while(i < data.shape[0] and \
(not utils.nearOrigin( \
endLatitude, \
endLongtitude, \
data[i][utils.dataDict["latitude"]], \
data[i][utils.dataDict["longitude"]], \
thresh = 0.0))):
this_OD_trajectory.append(data[i])
i += 1
if(i < data.shape[0]):
this_OD_trajectory.append(data[i]) # append the destination endpoint
this_OD_trajectory = np.asarray(this_OD_trajectory) # make it to be an np 2D array
""" box/radius approach in cleaning of points around origin"""
j = 1
print "checking points around origin:", j
while(j < this_OD_trajectory.shape[0] and \
utils.nearOrigin( \
originLatitude, \
originLongtitude, \
this_OD_trajectory[j][utils.dataDict["latitude"]], \
this_OD_trajectory[j][utils.dataDict["longitude"]], \
thresh = utils.NEIGHBOURHOOD_ORIGIN)):
j += 1
print "last point around origin:", j
this_OD_trajectory_around_origin = this_OD_trajectory[0:j]
"""Take the box mean, treat timestamp as averaged as well"""
this_OD_trajectory_mean_origin = boxMeanTrajectoryPoints(this_OD_trajectory_around_origin, originLatitude, originLongtitude)
print "mean start point x,y : ", utils.LatLonToXY( \
originLatitude, \
originLongtitude, \
this_OD_trajectory_mean_origin[utils.dataDict["latitude"]], \
this_OD_trajectory_mean_origin[utils.dataDict["longitude"]])
OD_trajectories.append(np.insert(this_OD_trajectory[j:],0,this_OD_trajectory_mean_origin, axis = 0))
			break # extract only one trajectory per OD pair, since the OD pair might recur
i += 1
OD_trajectories = np.array(OD_trajectories)
OD_trajectories_lat_lon = copy.deepcopy(OD_trajectories)
for i in range(0, len(OD_trajectories)):
for j in range(0, len(OD_trajectories[i])):
x, y = utils.LatLonToXY(originLatitude, originLongtitude, OD_trajectories[i][j][utils.dataDict["latitude"]], OD_trajectories[i][j][utils.dataDict["longitude"]])
OD_trajectories[i][j][utils.data_dict_x_y_coordinate["y"]] = y
OD_trajectories[i][j][utils.data_dict_x_y_coordinate["x"]] = x
# plotting purpose
plt.scatter(OD_trajectories[i][0:len(OD_trajectories[i]),utils.data_dict_x_y_coordinate["x"]], \
OD_trajectories[i][0:len(OD_trajectories[i]),utils.data_dict_x_y_coordinate["y"]])
if(not plt.gca().yaxis_inverted()):
plt.gca().invert_yaxis()
if(save):
plt.savefig("./{path}/{fname}.png".format(path = path, fname = fname))
if(show):
plt.show()
if(clean):
plt.clf()
return OD_trajectories, OD_trajectories_lat_lon
def getDistance(point1, point2):
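	# Euclidean (L2) distance between two AIS records in the projected x, y plane
	# given by utils.LatLonToXY (assumed here to be in kilometres).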
dx, dy = utils.LatLonToXY(point1[utils.dataDict["latitude"]], point1[utils.dataDict["longitude"]], point2[utils.dataDict["latitude"]], point2[utils.dataDict["longitude"]])
return (np.linalg.norm([dx,dy],2))
def alreadyInEndpoints(endpoints, target):
for i in range(0, len(endpoints)):
if(getDistance(endpoints[i], target) < utils.NEIGHBOURHOOD_ENDPOINT):
return True
return False
def extractEndPoints(data):
"""
	Note: if the trajectory is discontinued because the vessel moves out of detection range, add both the last point before leaving range and the first point back in range as endpoints
	TODO: further cleaning of the data is needed to extract better endpoints, e.g. in 8514019.csv endpoints 1 and 2 are actually the same place, but endpoint 3 is added due to an error point
"""
endpoints = []
print "data.shape:",data.shape
if (len(data) > 0):
endpoints.append(data[0]) # assume first point is an endpoint
i = 0
while(i< data.shape[0]):
start_point = data[i]
start_index = i
"""Find the next_point that marks the departure from endpoint"""
while(i+1<data.shape[0]):
next_point = data[i+1]
"""
			Break when the inter-point distance > threshold and the reading is not an error signal (speed is indeed > 0),
			or when the inter-point time difference > threshold for the point immediately after start_point
"""
if((getDistance(start_point, next_point) > utils.NEIGHBOURHOOD_ENDPOINT \
and next_point[utils.dataDict["speed_over_ground"]] > 0) or \
(next_point[utils.dataDict["ts"]] - start_point[utils.dataDict["ts"]] > utils.BOUNDARY_TIME_DIFFERENCE) \
and i == start_index # immediate point after start point
):
# print "found a point that is out of utils.NEIGHBOURHOOD_ENDPOINT:", datetime.datetime.fromtimestamp(start_point[utils.dataDict["ts"]]).strftime('%Y-%m-%dT%H:%M:%SZ'), \
# datetime.datetime.fromtimestamp(next_point[utils.dataDict["ts"]]).strftime('%Y-%m-%dT%H:%M:%SZ')
				break
i += 1
next_point = data[i] # back track to get the last data point that is still near start_point
if(i - start_index > 0 and next_point[utils.dataDict["ts"]] - start_point[utils.dataDict["ts"]] > utils.STAYTIME_THRESH):
if(len(endpoints) == 0 or (not (endpoints[len(endpoints) - 1] == start_point).all())): # if not just appended
endpoints.append(start_point)
elif((i+1) != data.shape[0]): # check boundary case
"""TODO: is there a boundary informaiton on the area that AIS can detect?"""
next_point_outside_neighbour = data[i+1]
if(next_point_outside_neighbour[utils.dataDict["ts"]] - start_point[utils.dataDict["ts"]] > utils.BOUNDARY_TIME_DIFFERENCE and \
(next_point_outside_neighbour[utils.dataDict["speed_over_ground"]] != 0 or \
				start_point[utils.dataDict["speed_over_ground"]] != 0)): # boundary case: a new trajectory starts at a new position after some time (at least one of the two speeds should be non-zero)
# if(next_point_outside_neighbour[utils.dataDict["ts"]] - start_point[utils.dataDict["ts"]] > \
# getDistance(start_point, next_point_outside_neighbour)/ \
# (1*utils.KNOTTOKMPERHOUR) * 3600): #maximum knot
print "append both, since start of new trajectory:", datetime.datetime.fromtimestamp(next_point[utils.dataDict["ts"]]).strftime('%Y-%m-%dT%H:%M:%SZ'), datetime.datetime.fromtimestamp(next_point_outside_neighbour[utils.dataDict["ts"]]).strftime('%Y-%m-%dT%H:%M:%SZ')
if(len(endpoints) == 0 or (not (endpoints[len(endpoints) - 1] == next_point).all())): # if not just appended
endpoints.append(next_point)
if(len(endpoints) == 0 or (not (endpoints[len(endpoints) - 1] == next_point_outside_neighbour).all())): # if not just appended
endpoints.append(next_point_outside_neighbour)
elif((i+1) == data.shape[0]):
if(len(endpoints) == 0 or (not (endpoints[len(endpoints) - 1] == next_point).all())): # if not just appended
endpoints.append(next_point) # last point in the .csv record, should be an end point
i += 1
return endpoints
def convertListOfTrajectoriesToLatLon(originLatitude, originLongtitude, listOfTrajectories):
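	# NOTE: converts in place (and also returns the list); callers that need to keep
	# the x, y version pass in a deep copy.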
for i in range(0, len(listOfTrajectories)):
for j in range(0, len(listOfTrajectories[i])):
lat, lon = utils.XYToLatLonGivenOrigin(originLatitude, originLongtitude, listOfTrajectories[i][j][utils.data_dict_x_y_coordinate["x"]], listOfTrajectories[i][j][utils.data_dict_x_y_coordinate["y"]])
listOfTrajectories[i][j][utils.dataDict["latitude"]] = lat
listOfTrajectories[i][j][utils.dataDict["longitude"]] = lon
return listOfTrajectories
def convertListOfTrajectoriesToXY(originLatitude, originLongtitude, listOfTrajectories):
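	# NOTE: in-place counterpart of convertListOfTrajectoriesToLatLon; reads the
	# lat, lon fields and overwrites the x, y fields on the same records.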
for i in range(0, len(listOfTrajectories)):
for j in range(0, len(listOfTrajectories[i])):
x, y = utils.LatLonToXY(originLatitude, originLongtitude, listOfTrajectories[i][j][utils.dataDict["latitude"]], listOfTrajectories[i][j][utils.dataDict["longitude"]])
listOfTrajectories[i][j][utils.data_dict_x_y_coordinate["y"]] = y
listOfTrajectories[i][j][utils.data_dict_x_y_coordinate["x"]] = x
return listOfTrajectories
def endPointMatchTrajectoryCentroid(endpoint, centroid, reference_lat, reference_lon):
	assert (len(centroid) > 0), "cluster centroid must be non-empty"
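	# Heuristic: the endpoint "matches" the centroid when the centroid's first point
	# lies within 20 x NEIGHBOURHOOD_ENDPOINT of the endpoint.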
x, y = utils.LatLonToXY(reference_lat,reference_lon,endpoint[utils.dataDict["latitude"]], endpoint[utils.dataDict["longitude"]])
centroid_start_x = centroid[0][utils.data_dict_x_y_coordinate["x"]]
centroid_start_y = centroid[0][utils.data_dict_x_y_coordinate["y"]]
if (np.linalg.norm([x - centroid_start_x, y - centroid_start_y], 2) < 20 * utils.NEIGHBOURHOOD_ENDPOINT):
return True
else:
return False
def endPointsToRepresentativeTrajectoryMapping(endpoints, trajectories, cluster_label, reference_lat, reference_lon):
"""
trajectories: in XY coordinate by reference_lat, reference_lon
endpoints: in lat, lon
	cluster_label: array of cluster labels aligned with the trajectories array; cluster indices start from 1
"""
endpoints_cluster_dict = {}
class_trajectories_dict = clustering_worker.formClassTrajectoriesDict(cluster_label = cluster_label, data = trajectories)
cluster_centroids_dict = {} # [cluster:centroid] dictionary
	for class_label, class_trajectories in class_trajectories_dict.iteritems():
		cluster_centroids_dict[class_label] = clustering_worker.getMeanTrajecotoryWithinClass(class_trajectories)
for endpoint in endpoints:
if (not "{lat}_{lon}".format(lat = endpoint[utils.dataDict["latitude"]], \
lon = endpoint[utils.dataDict["longitude"]]) in endpoints_cluster_dict):
endpoints_cluster_dict["{lat}_{lon}".format(lat = endpoint[utils.dataDict["latitude"]], \
lon = endpoint[utils.dataDict["longitude"]])] = []
for cluster, centroid in cluster_centroids_dict.iteritems():
if (endPointMatchTrajectoryCentroid(endpoint, centroid, reference_lat, reference_lon)):
endpoints_cluster_dict["{lat}_{lon}".format(lat = endpoint[utils.dataDict["latitude"]], \
lon = endpoint[utils.dataDict["longitude"]])].append(utils.ClusterCentroidTuple(cluster = cluster - 1, centroid = centroid)) # offset by 1
return endpoints_cluster_dict
def lookForEndPoints(endpoints, endpoint_str):
for endpoint in endpoints:
if ("{lat}_{lon}".format(lat = endpoint[utils.dataDict["latitude"]], \
lon = endpoint[utils.dataDict["longitude"]]) == endpoint_str):
return endpoint
return None
def executeClustering(root_folder, all_OD_trajectories_XY, reference_lat, reference_lon, filenames):
fname = "{root_folder}_dissimilarity_l2_cophenetic_distance".format(root_folder = root_folder)
# fname = "10_tankers_dissimilarity_l2_inconsistent_refined_endpoints"
# fname = "10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints"
# fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints"
# fname = "10_tankers_dissimilarity_l2_inconsistent"
# fname = "10_tankers_dissimilarity_l2_all_K"
# fname = "10_tankers_dissimilarity_center_mass"
# fname = "10_tankers_dissimilarity_center_mass_cophenetic_distance_cleaned"
# fname = "10_tankers_dissimilarity_center_mass_inconsistent_cleaned"
opt_cluster_label , cluster_labels, CH_indexes = clustering_worker.clusterTrajectories( \
trajectories = all_OD_trajectories_XY, \
fname = fname, \
path = utils.queryPath("tankers/cluster_result/{folder}".format(folder = fname)), \
metric_func = clustering_worker.trajectoryDissimilarityL2, \
# metric_func = clustering_worker.trajectoryDissimilarityCenterMass, \
# user_distance_matrix = writeToCSV.loadData(root_folder + \
# "/cluster_result/10_tankers_dissimilarity_center_mass/10_tankers_dissimilarity_center_mass_cleaned.npz"), \
# user_distance_matrix = writeToCSV.loadData(root_folder + \
# "/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned/10_tankers_dissimilarity_l2_cophenetic_distance_cleaned.npz"), \
# user_distance_matrix = writeToCSV.loadData(root_folder + \
# "/cluster_result/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints" + \
# "/10_tankers_dissimilarity_l2_cophenetic_distance_refined_endpoints.npz"), \
# user_distance_matrix = writeToCSV.loadData(root_folder + \
# "/cluster_result/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints" + \
# "/10_tankers_dissimilarity_center_mass_cophenetic_distance_refined_endpoints.npz"), \
criterion = 'distance')
print "opt_cluster_label:", opt_cluster_label
print "opt_num_cluster:", len(set(opt_cluster_label))
# print "distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityL2( \
# all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
# print "distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityL2( \
# all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
# print "center of mass measure distance between 1 and 4, should be quite small:", clustering_worker.trajectoryDissimilarityCenterMass( \
# all_OD_trajectories_XY[1], all_OD_trajectories_XY[4])
# print "center of mass measure distance between 0 and 4, should be quite large:", clustering_worker.trajectoryDissimilarityCenterMass( \
# all_OD_trajectories_XY[0], all_OD_trajectories_XY[4])
# print "matrix:\n", clustering_worker.getTrajectoryDistanceMatrix(\
# all_OD_trajectories_XY, \
# metric_func = clustering_worker.trajectoryDissimilarityL2)
# plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = True, clean = True, save = False, fname = "")
"""Construct the endpoints to representative trajectory mapping"""
endpoints = None
for filename in filenames:
this_vessel_endpoints = writeToCSV.readDataFromCSVWithMMSI( \
root_folder + "/endpoints", \
"{filename}_endpoints.csv".format(filename = filename[:filename.find(".")]))
# Append to the total end points
if(endpoints is None):
endpoints = this_vessel_endpoints
else:
endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis=0)
cluster_centroids = clustering_worker.getClusterCentroids(opt_cluster_label, all_OD_trajectories_XY)
cluster_centroids_lat_lon = {} # [cluster_label : centroid] dictionary
for cluster_label, centroid in cluster_centroids.iteritems():
cluster_centroids_lat_lon[cluster_label] = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
[copy.deepcopy(centroid)])[0]
# writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[cluster_label]), root_folder + "/cleanedData/DEBUGGING", \
# "refined_centroid_{i}".format(i = cluster_label))
# flatten
cluster_centroids_lat_lon_flattened = [point for cluster_label, centroid in cluster_centroids_lat_lon.iteritems() \
for point in centroid]
writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon_flattened), root_folder + "/cleanedData", \
"centroids_" + fname)
"""array of centroids written to .npz"""
writeToCSV.saveData([centroid for cluster_label, centroid in cluster_centroids_lat_lon.iteritems()], \
root_folder + "/cleanedData/centroids_arr")
# raise ValueError("purpose stop for clusering only")
"""DEBUGGING,using unrefined data"""
# point_to_examine = (1.2625833, 103.6827)
# point_to_examine_XY = utils.LatLonToXY(reference_lat,reference_lon,point_to_examine[0], point_to_examine[1])
# augmented_trajectories_from_point_to_examine_index = []
# augmented_trajectories_from_point_to_examine = []
# for i in range(0, len(all_OD_trajectories_XY)):
# trajectory = all_OD_trajectories_XY[i]
# if (np.linalg.norm([ \
# point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
# point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
# augmented_trajectories_from_point_to_examine_index.append(i)
# augmented_trajectories_from_point_to_examine.append(trajectory)
# print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index, \
# "starting pos:", trajectory[0][utils.data_dict_x_y_coordinate["x"]], trajectory[0][utils.data_dict_x_y_coordinate["y"]]
# print "augmented_trajectories_from_point_to_examine_index:", augmented_trajectories_from_point_to_examine_index
# augmented_trajectories_from_point_to_examine = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, copy.deepcopy(augmented_trajectories_from_point_to_examine))
# for t in range(0, len(augmented_trajectories_from_point_to_examine)):
# writeToCSV.writeDataToCSV(np.asarray(augmented_trajectories_from_point_to_examine[t]), root_folder + "/cleanedData/DEBUGGING", \
# "DEBUGGING_augmented_{t}".format(t = augmented_trajectories_from_point_to_examine_index[t]))
# augmented_trajectories_from_point_to_examine_clusters = []
# for i in augmented_trajectories_from_point_to_examine_index:
# augmented_trajectories_from_point_to_examine_clusters.append(opt_cluster_label[i])
# augmented_trajectories_from_point_to_examine_clusters_unique = list(set(augmented_trajectories_from_point_to_examine_clusters))
# class_trajectories_dict = clustering_worker.formClassTrajectoriesDict(opt_cluster_label, all_OD_trajectories_XY)
# for i in augmented_trajectories_from_point_to_examine_clusters_unique:
# writeToCSV.writeDataToCSV(np.asarray(cluster_centroids_lat_lon[i]), root_folder + "/cleanedData/DEBUGGING", \
# "DEBUGGING_centroid_{i}".format(i = i))
# print "cluster_centroids[{i}], starting point:".format(i = i), cluster_centroids[i][0]
# """save all trajectories under this cluster i """
# class_trajectories = class_trajectories_dict[i]
# class_trajectories_lat_lon = convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, copy.deepcopy(class_trajectories))
# for j in range(0, len(class_trajectories_lat_lon)):
# print "class_trajectories[{i}], starting point:".format(i = i), class_trajectories[j][0]
# writeToCSV.writeDataToCSV(np.asarray(class_trajectories_lat_lon[j]), \
# utils.queryPath(root_folder + "/cleanedData/DEBUGGING/CLASS{i}".format(i = i)) , \
# "DEBUGGING_class_{i}_trajectory_{j}".format(i = i , j = j))
"""END DEBUGGING"""
endpoints_cluster_dict = endPointsToRepresentativeTrajectoryMapping(\
endpoints, \
all_OD_trajectories_XY , \
opt_cluster_label, \
reference_lat, \
reference_lon)
empty_endpoints = []
augmented_index_to_extra_label_mapping = {} # mapping from normal index to appended index in all_protocol_trajectories
cluster_label_to_cluster_size = {} # 'cluster size' of the appended augmented trajectory in all_protocol_trajectories
all_protocol_trajectories = [] # indexed by cluster label (offset by 1, cluster 1 -> all_protocol_trajectories[0])
for label in range(np.min(opt_cluster_label), np.max(opt_cluster_label) + 1):
assert (label in cluster_centroids_lat_lon), "{label} is supposed to be in the cluster_centroids_lat_lon dict".format(label = label)
all_protocol_trajectories.append(cluster_centroids_lat_lon[label])
cluster_label_to_cluster_size[label - 1] = len(np.where(opt_cluster_label == label)[0])
assert(np.sum([size for label, size in cluster_label_to_cluster_size.iteritems()]) == len(opt_cluster_label)), "sum of individual label size should == total count"
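	# At this point all_protocol_trajectories[k] holds the centroid of cluster k + 1;
	# augmented trajectories assigned to empty endpoints are appended after these
	# with extra labels, whose 'cluster size' counts how many empty endpoints they serve.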
"""
assign augmented trajectories to empty endpoints: True/False
"""
	assign_augmented_to_empty_endpoints_flag = False
DEBUG_APPEND_INDEXS = []
	if (assign_augmented_to_empty_endpoints_flag):
for endpoint_str, endpoint_tuple_list in endpoints_cluster_dict.iteritems():
endpoint_starting_clusters = [item.cluster for item in endpoint_tuple_list] # get the list of cluster_labels of centroids to a certain endpoint
if (len(endpoint_starting_clusters) == 0):
"""If no centroid assigned, then assign the original augmented trajectory"""
this_empty_endpoint = lookForEndPoints(endpoints, endpoint_str) # endpoints is in lat, lon
if (this_empty_endpoint is None):
raise ValueError("Error! should always be able to map back endpoints, but {p} is not found".format(p = endpoint_str))
empty_endpoints.append(this_empty_endpoint)
point_to_examine_XY = utils.LatLonToXY(reference_lat,reference_lon, \
this_empty_endpoint[utils.dataDict["latitude"]], this_empty_endpoint[utils.dataDict["longitude"]])
augmented_trajectories_from_point_to_examine_index = []
augmented_trajectories_from_point_to_examine = []
for i in range(0, len(all_OD_trajectories_XY)):
trajectory = all_OD_trajectories_XY[i]
if (np.linalg.norm([ \
point_to_examine_XY[0] - trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
point_to_examine_XY[1] - trajectory[0][utils.data_dict_x_y_coordinate["y"]]], 2) < utils.NEIGHBOURHOOD_ENDPOINT):
augmented_trajectories_from_point_to_examine_index.append(i)
augmented_trajectories_from_point_to_examine.append(trajectory)
# print "this found augmented_trajectories_from_point_to_examine_index:", \
# augmented_trajectories_from_point_to_examine_index, \
# "starting pos:", \
# trajectory[0][utils.data_dict_x_y_coordinate["x"]], \
# trajectory[0][utils.data_dict_x_y_coordinate["y"]]
print "all indexes (w.r.t all_OD_trajectories_XY) for this_empty_endpoint:", augmented_trajectories_from_point_to_examine_index
DEBUG_APPEND_INDEXS.append(augmented_trajectories_from_point_to_examine_index)
"""Append augmented_trajectories_from_point_to_examine to end of array of centroids and give extra label"""
for augmented_index in augmented_trajectories_from_point_to_examine_index:
if (not augmented_index in augmented_index_to_extra_label_mapping):
						# if this normal trajectory has not been appended yet, append it and record it in augmented_index_to_extra_label_mapping
augmented_index_to_extra_label_mapping[augmented_index] = len(all_protocol_trajectories)
cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] = 1
all_protocol_trajectories.append(\
convertListOfTrajectoriesToLatLon(reference_lat, reference_lon, \
[copy.deepcopy(all_OD_trajectories_XY[augmented_index])])[0])
else:
cluster_label_to_cluster_size[augmented_index_to_extra_label_mapping[augmented_index]] += 1
endpoints_cluster_dict[endpoint_str].append(utils.ClusterCentroidTuple(\
cluster = augmented_index_to_extra_label_mapping[augmented_index], \
centroid = all_protocol_trajectories[augmented_index_to_extra_label_mapping[augmented_index]]))
"""Asserting and Saving of info for Agent Based Simulator"""
assert (len(set([index for index_list in DEBUG_APPEND_INDEXS for index in index_list])) == \
len(all_protocol_trajectories) - len(set(opt_cluster_label))), \
"size of appended augmented trajectories should == len(DEBUG_APPEND_INDEXS)"
for index in range(0, len(all_protocol_trajectories)):
assert(index in cluster_label_to_cluster_size), "all_protocol_trajectories's index mapping to cluster should be complete"
for label, size in cluster_label_to_cluster_size.iteritems():
print "label, size:", label, size
print "number of endpoints that do not have clusters assigned to:", len(empty_endpoints)
print "total number of endpoints:", len(endpoints)
writeToCSV.writeDataToCSVWithMMSI(np.asarray(endpoints), root_folder + "/endpoints", "all_endpoints_with_MMSI")
writeToCSV.writeDataToCSV(np.asarray(empty_endpoints), root_folder + "/cleanedData", \
"non_starting_endpoints_{root_folder}_dissimilarity_l2_cophenetic_distance_cleaned".format(root_folder = root_folder))
writeToCSV.saveData([endpoints_cluster_dict], \
filename = root_folder + "/cleanedData" + "/endpoints_cluster_dict" + fname)
"""write all the all_protocol_trajectories for DEBUGGING purpose"""
for i in range(0, len(all_protocol_trajectories)):
protocol_trajectory = all_protocol_trajectories[i]
writeToCSV.writeDataToCSV(\
np.asarray(protocol_trajectory), \
utils.queryPath(root_folder + "/cleanedData/DEBUGGING/ALL_PROTOCOLS_PATTERN_ONLY"), \
"all_protocol_{i}".format(i = i))
"""Save related csv files for Agent Based Simulator"""
writeToCSV.writeAllProtocolTrajectories(\
path = utils.queryPath(root_folder+"LearningResult"), \
file_name = "protocol_trajectories_with_cluster_size", \
all_protocol_trajectories = all_protocol_trajectories, \
cluster_label_to_cluster_size = cluster_label_to_cluster_size)
writeToCSV.writeEndPointsToProtocolTrajectoriesIndexesWithMMSI(\
path = utils.queryPath(root_folder+"LearningResult"), \
file_name = "endpoints_to_protocol_trajectories", \
endpoints = endpoints, \
endpoints_cluster_dict = endpoints_cluster_dict)
def main():
root_folder = raw_input("Input the root_folder name:")
"""
Firstly, extract all .csv input file names from {root_folder}/input/*.csv
"""
# filenames = ["8514019.csv", "9116943.csv", "9267118.csv", "9443140.csv", "9383986.csv", "9343340.csv", "9417464.csv", "9664225.csv", "9538440.csv", "9327138.csv"]
# filenames = ["9664225.csv"]
# filenames = ["8514019.csv"]
filenames = []
for input_filename in os.listdir("{root_folder}/input/".format(root_folder = root_folder)):
if (input_filename.find(".csv") != -1):
filenames.append(input_filename)
"""
Get min distance between vessels
"""
need_compute_mindistance = raw_input("Need to compute min_distance_matrix for vessel interaction? (y/n) :") == 'y'
if (need_compute_mindistance):
"""sort the aggregateData with MMSI based on TS"""
data_with_mmsi = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi.csv")
data_with_mmsi_sorted = compute_mindistance.sortDataBasedOnTS(data_with_mmsi)
writeToCSV.writeDataToCSVWithMMSI(data_with_mmsi_sorted, root_folder + "/cleanedData", "aggregateData_with_mmsi_sorted")
"""Apply the computing of min distance using a timed window"""
data_with_mmsi_sorted = writeToCSV.readDataFromCSVWithMMSI(path = root_folder + "/cleanedData", filename = "aggregateData_with_mmsi_sorted.csv")
mmsi_set = compute_mindistance.getSetOfMMSI(data_with_mmsi_sorted)
print mmsi_set
print list(mmsi_set)
start_time = time.time()
mmsi_list_dict, min_distance_matrix, vessel_distance_speed_dict = \
compute_mindistance.computeVesselMinDistanceMatrix(data_with_mmsi_sorted, TIME_WINDOW = 1800)
writeToCSV.saveData([{ \
'mmsi_list_dict': mmsi_list_dict, \
'min_distance_matrix': min_distance_matrix, \
'vessel_distance_speed_dict': vessel_distance_speed_dict
}], filename = root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec")
print "time spent:", time.time() - start_time
"""From already computed"""
# min_distance_matrix_result = writeToCSV.loadData(\
# root_folder + "/cleanedData" + "/min_distance_matrix_with_mmsi_time_window_1800_sec.npz")
# print "min_distance_matrix_result type:\n", type(min_distance_matrix_result)
# mmsi_list_dict = min_distance_matrix_result[0]["mmsi_list_dict"]
# min_distance_matrix = min_distance_matrix_result[0]["min_distance_matrix"]
# vessel_distance_speed_dict = min_distance_matrix_result[0]["vessel_distance_speed_dict"]
# print "min_distance_matrix loaded:\n", min_distance_matrix
# min_of_min_distance = sys.maxint
# for i in range(0, min_distance_matrix.shape[0]):
# for j in range(i + 1, min_distance_matrix.shape[1]):
# if (min_distance_matrix[i][j] < min_of_min_distance):
# min_of_min_distance = min_distance_matrix[i][j]
# print "min_distance_matrix min of 10 tankers:", min_of_min_distance
"""write min distance records for Agent Based Simulator"""
writeToCSV.writeVesselSpeedToDistance(\
path = utils.queryPath(root_folder+"LearningResult"),\
file_name = "vessel_speed_to_distance", \
vessel_distance_speed_dict = vessel_distance_speed_dict)
writeToCSV.writeVesselMinDistanceMatrix(\
path = utils.queryPath(root_folder+"LearningResult"), \
file_name = "vessel_min_distance_matrix", \
mmsi_list_dict = mmsi_list_dict, \
min_distance_matrix = min_distance_matrix)
writeToCSV.writeMMSIs(\
path = utils.queryPath(root_folder+"LearningResult"), \
file_name = "mmsi_list", \
mmsi_list = [key for key, index in mmsi_list_dict.iteritems()])
"""
Test Clustering
"""
# trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_with_1D_data_refined.npz")
# # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_cleaned.npz")
# # trajectories_to_cluster = writeToCSV.loadData(root_folder + "/" + "all_OD_trajectories_9664225.npz")
# print "trajectories_to_cluster.shape: ", trajectories_to_cluster.shape
# print "type(trajectories_to_cluster): ", type(trajectories_to_cluster)
# print "len(trajectories_to_cluster): ", len(trajectories_to_cluster)
# # convert Lat, Lon to XY for clustering
# all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, trajectories_to_cluster)
# executeClustering(root_folder = root_folder, \
# all_OD_trajectories_XY = all_OD_trajectories_XY, \
# reference_lat = utils.CENTER_LAT_SG, \
# reference_lon = utils.CENTER_LON_SG, \
# filenames = filenames)
# raise ValueError("purpose stop for testing clustering")
"""
plot out the value space of the features, speed, accelerations, etc, for the aggregateData
"""
# filename = "aggregateData.npz"
# path = "tankers/cleanedData"
# data = writeToCSV.loadArray("{p}/{f}".format(p = path, f=filename))
# for trajectory in trajectories_to_cluster:
# plotter.plotFeatureSpace(trajectory)
# raise ValueError("For plotting feature space only")
"""
	Read the cleaned .csv input files from {root_folder}/cleanedData/
Extract endpoints
"""
endpoints = None
all_OD_trajectories = []
utils.queryPath("{root_folder}/endpoints".format(root_folder = root_folder))
utils.queryPath("{root_folder}/trajectories".format(root_folder = root_folder))
for i in range(0, len(filenames)):
this_vessel_trajectory_points = writeToCSV.readDataFromCSV(root_folder + "/cleanedData", filenames[i])
# Extract end points, along with MMSI
this_vessel_endpoints = np.asarray(extractEndPoints(writeToCSV.readDataFromCSVWithMMSI(root_folder + "/cleanedData", filenames[i])))
# Save end points, along with MMSI
writeToCSV.writeDataToCSVWithMMSI( \
this_vessel_endpoints, \
root_folder + "/endpoints", \
"{filename}_endpoints".format(filename = filenames[i][:filenames[i].find(".")]))
print "this_vessel_endpoints.shape:", this_vessel_endpoints.shape
# Append to the total end points
if(endpoints is None):
endpoints = this_vessel_endpoints
else:
endpoints = np.concatenate((endpoints, this_vessel_endpoints), axis=0)
for s in range (0, len(this_vessel_endpoints) - 1):
originLatitude = this_vessel_endpoints[s][utils.dataDict["latitude"]]
originLongtitude = this_vessel_endpoints[s][utils.dataDict["longitude"]]
origin_ts = this_vessel_endpoints[s][utils.dataDict["ts"]]
endLatitude = this_vessel_endpoints[s + 1][utils.dataDict["latitude"]]
endLongtitude = this_vessel_endpoints[s + 1][utils.dataDict["longitude"]]
end_ts = this_vessel_endpoints[s + 1][utils.dataDict["ts"]]
"""Extracting trajectory between a pair of OD"""
print "\n\nextracting endpoints between ", s, " and ", s + 1
OD_trajectories, OD_trajectories_lat_lon = extractTrajectoriesUntilOD(\
this_vessel_trajectory_points, \
origin_ts, \
originLatitude, \
originLongtitude, \
end_ts, \
endLatitude, \
endLongtitude, \
show = False, save = True, clean = False, \
fname = filenames[i][:filenames[i].find(".")] + "_trajectory_between_endpoint{s}_and{e}".format(s = s, e = s + 1))
# there will be one trajectory between each OD
assert (len(OD_trajectories) > 0), "OD_trajectories extracted must have length > 0"
print "number of trajectory points extracted : ", len(OD_trajectories[0])
if(len(OD_trajectories[0]) > 2): # more than just the origin and destination endpoints along the trajectory
writeToCSV.writeDataToCSV( \
data = OD_trajectories_lat_lon[0],
path = root_folder + "/trajectories", \
file_name = "{filename}_trajectory_endpoint_{s}_to_{e}".format(filename = filenames[i][:filenames[i].find(".")], \
s = s, \
e = s + 1))
"""
Interpolation based on pure geographical trajectory, ignore temporal information
"""
interpolated_OD_trajectories = interpolator.geographicalTrajetoryInterpolation(OD_trajectories)
plotter.plotListOfTrajectories( \
interpolated_OD_trajectories, \
show = False, \
clean = True, \
save = True, \
fname = filenames[i][:filenames[i].find(".")] + "_interpolated_algo_3_between_endpoint{s}_and{e}".format(\
s = s, \
e = s + 1))
"""
Interpolation of 1D data: speed, rate_of_turn, etc; interpolated_OD_trajectories / OD_trajectories are both in X, Y coordinates
"""
if(len(interpolated_OD_trajectories) > 0):
interpolated_OD_trajectories[0] = interpolator.interpolate1DFeatures( \
interpolated_OD_trajectories[0], \
OD_trajectories[0])
# change X, Y coordinate to Lat, Lon
interpolated_OD_trajectories_lat_lon = convertListOfTrajectoriesToLatLon( \
originLatitude, originLongtitude, interpolated_OD_trajectories)
if(len(interpolated_OD_trajectories_lat_lon) > 0):
# since there should be only one trajectory between each pair of OD
all_OD_trajectories.append(interpolated_OD_trajectories_lat_lon[0])
else:
print "no trajectories extracted between endpoints ", s , " and ", s + 1
plt.clf()
	assert (endpoints is not None), "Error: No endpoints extracted from the historical data of vessels " + "_".join(filenames)
print "Final endpoints.shape:", endpoints.shape
print "number of interpolated all_OD_trajectories:", len(all_OD_trajectories)
"""
save the augmented trajectories between endpoints as npz data file and the plot
"""
# remove error trajectories that are too far from Singapore
all_OD_trajectories = utils.removeErrorTrajectoryFromList(all_OD_trajectories)
writeToCSV.saveData(all_OD_trajectories, root_folder + "/all_OD_trajectories_with_1D_data")
# convert Lat, Lon to XY for displaying
all_OD_trajectories_XY = convertListOfTrajectoriesToXY(utils.CENTER_LAT_SG, utils.CENTER_LON_SG, all_OD_trajectories)
plotter.plotListOfTrajectories(all_OD_trajectories_XY, show = False, clean = True, save = True, \
fname = "{root_folder}_all_OD_trajectories".format(root_folder = root_folder))
"""
Execute Clustering
"""
executeClustering(root_folder = root_folder, \
all_OD_trajectories_XY = all_OD_trajectories_XY, \
reference_lat = utils.CENTER_LAT_SG, \
reference_lon = utils.CENTER_LON_SG, \
filenames = filenames)
if __name__ == "__main__":
main()
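# Example run (assuming the expected folder layout under the chosen root_folder):
#   $ python trajectory_modeller.py
#   Input the root_folder name:tankers
#   Need to compute min_distance_matrix for vessel interaction? (y/n) :n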