def test_haversine_dist(self):
    """
    input_latlng saves different combinations of haversine distances in meters
    and the longitudes & latitudes of two different points in WGS84.

    References
    ----------
    https://community.esri.com/groups/coordinate-reference-systems/blog/2017/10/05/haversine-formula
    """
    # {haversine distance in meters: [longitude_P1, latitude_P1, longitude_P2, latitude_P2]}
    input_latlng = {
        18749: [8.5, 47.3, 8.7, 47.2],  # Source: see the reference above
        5897658.289: [-0.116773, 51.510357, -77.009003, 38.889931],
        3780627: [0.0, 4.0, 0.0, 38],
        # Source for the following entries: self-computation with the formula from the link above
        2306879.363: [-7.345, -7.345, 7.345, 7.345],
        13222121.519: [-0.118746, 73.998, -120.947783, -21.4783],
        785767.221: [50, 0, 45, 5]
    }

    for haversine, latlng in input_latlng.items():
        haversine_output = haversine_dist(latlng[0], latlng[1], latlng[2], latlng[3])
        assert np.isclose(haversine_output, haversine, atol=0.1)
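For context, a minimal sketch of the haversine formula these tests exercise, assuming the mean Earth radius of 6371000 m used elsewhere in this code and the (lon, lat, lon, lat) argument order of haversine_dist; the name haversine_sketch is illustrative, not the project's implementation:

import numpy as np

def haversine_sketch(lon_1, lat_1, lon_2, lat_2, r=6371000):
    """Great-circle distance in meters between two WGS84 points (vectorized)."""
    lon_1, lat_1, lon_2, lat_2 = map(np.radians, (lon_1, lat_1, lon_2, lat_2))
    dlon = lon_2 - lon_1
    dlat = lat_2 - lat_1
    # haversine of the central angle, then convert back to a distance on the sphere
    a = np.sin(dlat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(dlon / 2) ** 2
    return 2 * r * np.arcsin(np.sqrt(a))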
def test_haversine_vectorized(self):
    spts = ti.read_staypoints_csv(
        os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv'))
    x = spts.geometry.x.values
    y = spts.geometry.y.values

    n = len(x)

    # our distance
    ix_1, ix_2 = np.triu_indices(n, k=1)

    x1 = x[ix_1]
    y1 = y[ix_1]
    x2 = x[ix_2]
    y2 = y[ix_2]
    d_ours = haversine_dist(x1, y1, x2, y2)

    # their distance
    x_rad = np.asarray([radians(_) for _ in x])
    y_rad = np.asarray([radians(_) for _ in y])
    yx = np.concatenate((y_rad.reshape(-1, 1), x_rad.reshape(-1, 1)), axis=1)

    D_theirs = haversine_distances(yx, yx) * 6371000
    d_theirs = D_theirs[ix_1, ix_2]
    assert np.sum(np.abs(d_ours - d_theirs)) < 0.01  # 1cm for 58 should be good enough
def identify_mode(tripleg, wgs, categories):
    """
    Identify the mode based on the (overall) tripleg speed.

    Parameters
    ----------
    tripleg : trackintel triplegs GeoDataFrame
        The tripleg to analyse.
    wgs : bool
        Whether the tripleg is in WGS84 or not.
    categories : dict
        The upper speed boundaries (as keys) and the names of the categories (as values).

    Returns
    -------
    str
        The identified mode.
    """
    # Compute the distance over the whole tripleg geometry
    # (using the haversine distance if the tripleg is in WGS84).
    if wgs:
        distance = sum([haversine_dist(pt1[0], pt1[1], pt2[0], pt2[1])
                        for pt1, pt2 in zip(tripleg.geom.coords[:-1], tripleg.geom.coords[1:])])
    else:
        distance = tripleg.geom.length

    duration = (tripleg['finished_at'] - tripleg['started_at']).total_seconds()
    speed = distance / duration  # the unit of the speed is m/s

    for bound in categories:
        if speed < bound:
            return categories[bound]
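As an illustration, a hypothetical way to apply identify_mode to a triplegs GeoDataFrame; the category boundaries (in m/s) and the triplegs variable are assumptions for this example, not values defined in this codebase:

import numpy as np

# Hypothetical speed categories: upper boundary in m/s -> mode label.
# The boundaries must be given in ascending order, because identify_mode
# returns the label of the first boundary that the tripleg speed stays below.
categories = {10 / 3.6: 'slow_mobility',
              40 / 3.6: 'motorized_mobility',
              np.inf: 'fast_mobility'}

# Apply row-wise to a (hypothetical) triplegs GeoDataFrame in WGS84.
triplegs['mode'] = triplegs.apply(identify_mode, axis=1, wgs=True, categories=categories)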
def test_example_from_sklearn(self):
    bsas = [-34.83333, -58.5166646]
    paris = [49.0083899664, 2.53844117956]
    bsas_in_radians = [radians(_) for _ in bsas]
    paris_in_radians = [radians(_) for _ in paris]
    d_theirs = haversine_distances([bsas_in_radians, paris_in_radians]) * 6371000

    d_ours = haversine_dist(bsas[1], bsas[0], paris[1], paris[0])

    assert np.abs(d_theirs[1][0] - d_ours) < 0.01
def findStayPoints(locs, dataName, accuracy_threshold, dist_threshold,
                   time_threshold, timemax_threshold):
    """
    Finds the staypoints from the raw locations, following Li's algorithm,
    with the help of trackintel.

    Parameters
    ----------
    locs : gdf
        Raw points as a GeoPandas GeoDataFrame.
    dataName : str
        ID of the participant.
    accuracy_threshold, dist_threshold, time_threshold, timemax_threshold : float
        The different thresholds needed in the analysis.

    Returns
    -------
    pfs : gdf
        Positionfixes, i.e. the raw locations in the trackintel format.
    stps : gdf
        Staypoints, found by the trackintel algorithm.
    """
    # Calculate the time and distance differences
    locs['d_diff'] = np.append(
        haversine_dist(locs.longitudeE7[1:], locs.latitudeE7[1:],
                       locs.longitudeE7[:-1], locs.latitudeE7[:-1]), 0)

    locs = locs[locs['accuracy'] < accuracy_threshold]
    # locs = locs[locs['accuracy'] < locs['d_diff']]

    if not os.path.exists('../data/results/shp/' + dataName + '/'):
        os.makedirs('../data/results/shp/' + dataName + '/')

    hlp.loc2csv4ti(locs, dataName)
    pfs = ti.read_positionfixes_csv(
        '../data/results/csv/' + dataName + '/' + dataName + '.csv', sep=';')

    # Find staypoints using a slightly modified version of the trackintel script
    stps = tim.extract_staypoints_ipa(pfs, method='sliding',
                                      dist_threshold=dist_threshold,
                                      time_threshold=time_threshold,
                                      timemax_threshold=timemax_threshold)

    return pfs, stps
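A hypothetical call of findStayPoints, assuming locs has already been parsed (e.g. via hlp.parseLocs); the participant ID and threshold values below are illustrative assumptions, not values prescribed by this project:

# Illustrative thresholds (assumptions only).
pfs, stps = findStayPoints(locs, dataName='participant_01',
                           accuracy_threshold=200,       # m: drop inaccurate fixes
                           dist_threshold=50,            # m: distance threshold of Li's algorithm
                           time_threshold=5 * 60,        # s: minimum stay duration
                           timemax_threshold=12 * 3600)  # s: split very long stays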
def stydiffstat(dataNameList, SELECT_RANGE, dateStart, dateEnd):
    """
    Return distance- and time-difference statistics per participant,
    used to semi-automatically choose the staypoint thresholds.

    Parameters
    ----------
    dataNameList : list
        List of strings of all participant IDs with shared data.
    SELECT_RANGE : bool
        Flag that defines whether a certain period is selected.
    dateStart : str
        The start date of the period, if a certain period is selected.
    dateEnd : str
        The end date of the period, if a certain period is selected.

    Returns
    -------
    staythredstat : DataFrame
        Useful statistics to semi-automatically choose thresholds.
    """
    ddiff_max = []
    ddiff_min = []
    ddiff_mean = []
    ddiff_median = []
    ddiff_quar = []
    tdiff_max = []
    tdiff_min = []
    tdiff_mean = []
    tdiff_median = []
    tdiff_quar = []

    for dataName in dataNameList:
        dataPathLocs, dataPathTrips = hlp.getDataPaths(dataName)

        if SELECT_RANGE:
            dataPathLocs, dataPathTrips = hlp.selectRange(
                dataPathLocs, dataPathTrips, dateStart, dateEnd)

        locs, locsgdf = hlp.parseLocs(dataPathLocs)

        locs['d_diff'] = np.append(
            haversine_dist(locs.longitudeE7[1:], locs.latitudeE7[1:],
                           locs.longitudeE7[:-1], locs.latitudeE7[:-1]), 0)
        accuracy_threshold = np.quantile(locs['d_diff'], .95)
        locs['t_diff'] = np.append(
            (locs.index[1:] - locs.index[:-1]).total_seconds(), 0)

        # collect the distance-difference statistics
        ddiff_max.append(max(locs['d_diff']))
        ddiff_min.append(min(locs['d_diff']))
        ddiff_mean.append(np.mean(locs['d_diff']))
        ddiff_median.append(np.median(locs['d_diff']))
        ddiff_quar.append(np.quantile(locs['d_diff'], .25))

        # collect the time-difference statistics
        tdiff_max.append(max(locs['t_diff']))
        tdiff_min.append(min(locs['t_diff']))
        tdiff_mean.append(np.mean(locs['t_diff']))
        tdiff_median.append(np.median(locs['t_diff']))
        tdiff_quar.append(np.quantile(locs['t_diff'], .25))

    # convert the collected lists to arrays (needed for the range computations below)
    ddiff_max, ddiff_min = np.array(ddiff_max), np.array(ddiff_min)
    ddiff_mean, ddiff_median, ddiff_quar = (np.array(ddiff_mean), np.array(ddiff_median),
                                            np.array(ddiff_quar))
    tdiff_max, tdiff_min = np.array(tdiff_max), np.array(tdiff_min)
    tdiff_mean, tdiff_median, tdiff_quar = (np.array(tdiff_mean), np.array(tdiff_median),
                                            np.array(tdiff_quar))

    thredstat = {
        'dataName': np.array(dataNameList),
        'dist_max': ddiff_max,
        'dist_min': ddiff_min,
        'dist_range': ddiff_max - ddiff_min,
        'dist_mean': ddiff_mean,
        'dist_median': ddiff_median,
        'dist_quarter': ddiff_quar,
        'time_max': tdiff_max,
        'time_min': tdiff_min,
        'time_range': tdiff_max - tdiff_min,
        'time_mean': tdiff_mean,
        'time_median': tdiff_median,
        'time_quarter': tdiff_quar
    }
    staythredstat = pd.DataFrame(thredstat)

    return staythredstat
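A hypothetical call of stydiffstat; the participant IDs and date range are placeholders for illustration only:

# Placeholder participant IDs and date range.
staythredstat = stydiffstat(['participant_01', 'participant_02'],
                            SELECT_RANGE=True,
                            dateStart='2020-01-01',
                            dateEnd='2020-06-30')

# Inspect, e.g., the median distance and time gaps to pick thresholds.
print(staythredstat[['dataName', 'dist_median', 'time_median']])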
dataPathLocs, dataPathTrips = hlp.selectRange(
    dataPathLocs,
    dataPathTrips,
    mac,
    dateStart=thresholds["dateStart"],
    dateEnd=thresholds["dateEnd"],
)

locs, locsgdf = hlp.parseLocs(dataPathLocs)
trips, tripdf, tripsgdf = hlp.parseTrips(dataPathTrips)

# add location data to the trips file (not used now because only for visualization of google results)
# tripsgdf = hlp.parseTripsWithLocs(dataPathTrips, locsgdf)

locs['d_diff'] = np.append(
    haversine_dist(locs.longitudeE7[1:], locs.latitudeE7[1:],
                   locs.longitudeE7[:-1], locs.latitudeE7[:-1]), 0)

if CHOOSE_THRES:
    thresholds['accuracy_threshold'] = np.quantile(locs['d_diff'], .95)

# export to shapefile
if exportShp:
    hlp.loc2shp(locsgdf, dataName)
    hlp.trip2shp(tripsgdf, dataName)

#%% FIND STAY POINTS
if FIND_STAY_POINTS:
    print("-> Finding stay points ")
    pfs, stps = main.findStayPoints(locs, dataName,
                                    thresholds["accuracy_threshold"],
                                    thresholds["dist_threshold"],
                                    thresholds["time_threshold"],
def extract_staypoints_ipa(positionfixes, method='sliding', dist_threshold=50,
                           time_threshold=5 * 60, timemax_threshold=12 * 3600,
                           epsilon=100, dist_func=haversine_dist, eps=None,
                           num_samples=None):
    """Extract staypoints from positionfixes.

    This function modifies the positionfixes and adds staypoint_ids.

    Parameters
    ----------
    num_samples : int
        The min_samples parameter passed to DBSCAN for the 'dbscan' method.
    eps : float
        Unused; the 'dbscan' method uses `epsilon` instead.
    positionfixes : GeoDataFrame
        The positionfixes have to follow the standard definition for positionfixes DataFrames.
    method : {'sliding' or 'dbscan'}
        The following methods are available to extract staypoints from positionfixes:
        'sliding' : Applies a sliding window over the data.
        'dbscan' : Uses the DBSCAN algorithm to find clusters of staypoints.
    dist_threshold : float
        The distance threshold for the 'sliding' method, i.e., how far someone has to travel
        to generate a new staypoint.
    time_threshold : float
        The time threshold for the 'sliding' method in seconds, i.e., how long someone has to
        stay within an area to consider it as a staypoint.
    epsilon : float
        The epsilon for the 'dbscan' method.
    dist_func : function
        A function that expects (lon_1, lat_1, lon_2, lat_2) and computes a distance in meters.

    Returns
    -------
    GeoDataFrame
        A new GeoDataFrame containing points where a person spent some time.

    Examples
    --------
    >>> psfs.as_positionfixes.extract_staypoints('sliding', dist_threshold=100)

    References
    ----------
    Zheng, Y. (2015). Trajectory data mining: an overview. ACM Transactions on Intelligent
    Systems and Technology (TIST), 6(3), 29.

    Li, Q., Zheng, Y., Xie, X., Chen, Y., Liu, W., & Ma, W. Y. (2008, November). Mining user
    similarity based on location history. In Proceedings of the 16th ACM SIGSPATIAL
    international conference on Advances in geographic information systems (p. 34). ACM.
    """
    if 'id' not in positionfixes.columns:
        positionfixes['id'] = positionfixes.index

    ret_staypoints = pd.DataFrame(
        columns=['started_at', 'finished_at', 'geom', 'id'])

    if method == 'sliding':
        # Algorithm from Li et al. (2008). For details, please refer to the paper.
        staypoint_id_counter = 0
        positionfixes['staypoint_id'] = -1  # this marks all that are not part of a SP

        for user_id_this in positionfixes['user_id'].unique():

            positionfixes_user_this = positionfixes.loc[
                positionfixes['user_id'] == user_id_this]  # this is not a copy

            pfs = positionfixes_user_this.sort_values('tracked_at').to_dict('records')
            num_pfs = len(pfs)

            posfix_staypoint_matching = {}

            i = 0
            j = 0  # is zero because it gets incremented in the beginning
            while i < num_pfs:
                if j == num_pfs:
                    # We're at the end; this can happen if in the last "bin"
                    # the dist_threshold is never crossed anymore.
                    break
                else:
                    j = i + 1

                while j < num_pfs:
                    dist = haversine_dist(pfs[i]['geom'].x, pfs[i]['geom'].y,
                                          pfs[j]['geom'].x, pfs[j]['geom'].y)

                    if dist > dist_threshold:
                        delta_t = (pfs[j]['tracked_at'] -
                                   pfs[i]['tracked_at']).total_seconds()
                        # Compare with the minimum and maximum time thresholds
                        if delta_t > time_threshold:
                            if delta_t > timemax_threshold:
                                # The stay exceeds timemax_threshold: split it into several staypoints
                                hrdiff = []
                                hrsum = 0
                                for x in range(i, j):
                                    hrdiff.append((pfs[x + 1]['tracked_at'] -
                                                   pfs[x]['tracked_at']).total_seconds())
                                i0 = i
                                for mid in range(0, j - i0):
                                    hrsum += hrdiff[mid]
                                    if hrsum > timemax_threshold or mid == j - i0 - 1:
                                        staypoint = {}
                                        staypoint['user_id'] = pfs[i]['user_id']
                                        staypoint['geom'] = Point(
                                            np.mean([pfs[k]['geom'].x for k in range(i, i0 + mid + 1)]),
                                            np.mean([pfs[k]['geom'].y for k in range(i, i0 + mid + 1)]))
                                        if 'elevation' in pfs[i].keys():
                                            staypoint['elevation'] = np.mean(
                                                [pfs[k]['elevation'] for k in range(i, i0 + mid + 1)])
                                        if 'velocity' in pfs[i].keys():
                                            staypoint['velocity'] = np.mean(
                                                [pfs[k]['velocity'] for k in range(i, i0 + mid + 1)])
                                        staypoint['started_at'] = pfs[i]['tracked_at']
                                        # TODO: should this not be j-1? because j is not part of the staypoint. DB: Changed.
                                        staypoint['finished_at'] = pfs[i0 + mid + 1]['tracked_at']
                                        staypoint['id'] = staypoint_id_counter

                                        # store matching
                                        posfix_staypoint_matching[staypoint_id_counter] = [
                                            pfs[k]['id'] for k in range(i, i0 + mid + 1)]
                                        staypoint_id_counter += 1

                                        # add staypoint
                                        ret_staypoints = ret_staypoints.append(
                                            staypoint, ignore_index=True)

                                        i = i0 + mid + 1
                                        hrsum = 0
                            else:
                                staypoint = {}
                                staypoint['user_id'] = pfs[i]['user_id']
                                staypoint['geom'] = Point(
                                    np.mean([pfs[k]['geom'].x for k in range(i, j)]),
                                    np.mean([pfs[k]['geom'].y for k in range(i, j)]))
                                if 'elevation' in pfs[i].keys():
                                    staypoint['elevation'] = np.mean(
                                        [pfs[k]['elevation'] for k in range(i, j)])
                                if 'velocity' in pfs[i].keys():
                                    staypoint['velocity'] = np.mean(
                                        [pfs[k]['velocity'] for k in range(i, j)])
                                staypoint['started_at'] = pfs[i]['tracked_at']
                                # TODO: should this not be j-1? because j is not part of the staypoint. DB: Changed.
                                staypoint['finished_at'] = pfs[j]['tracked_at']
                                staypoint['id'] = staypoint_id_counter

                                # store matching
                                posfix_staypoint_matching[staypoint_id_counter] = [
                                    pfs[k]['id'] for k in range(i, j)]
                                staypoint_id_counter += 1

                                # add staypoint
                                ret_staypoints = ret_staypoints.append(
                                    staypoint, ignore_index=True)

                                # TODO Discussion: Is this last point really a staypoint? As we
                                # don't know if the person "moves on" afterwards...
                        i = j
                        break

                    # If the last point meets the minimum time threshold, it is added to the staypoint
                    if j == num_pfs - 1:
                        delta_t = (pfs[j]['tracked_at'] -
                                   pfs[i]['tracked_at']).total_seconds()
                        if delta_t > time_threshold:
                            if delta_t > timemax_threshold:
                                hrdiff = []
                                hrsum = 0
                                for x in range(i, j):
                                    hrdiff.append((pfs[x + 1]['tracked_at'] -
                                                   pfs[x]['tracked_at']).total_seconds())
                                i0 = i
                                for mid in range(0, j - i0):
                                    hrsum += hrdiff[mid]
                                    if hrsum > timemax_threshold or mid == j - i0 - 1:
                                        staypoint = {}
                                        staypoint['user_id'] = pfs[i]['user_id']
                                        staypoint['geom'] = Point(
                                            np.mean([pfs[k]['geom'].x for k in range(i, i0 + mid + 1)]),
                                            np.mean([pfs[k]['geom'].y for k in range(i, i0 + mid + 1)]))
                                        if 'elevation' in pfs[i].keys():
                                            staypoint['elevation'] = np.mean(
                                                [pfs[k]['elevation'] for k in range(i, i0 + mid + 1)])
                                        if 'velocity' in pfs[i].keys():
                                            staypoint['velocity'] = np.mean(
                                                [pfs[k]['velocity'] for k in range(i, i0 + mid + 1)])
                                        staypoint['started_at'] = pfs[i]['tracked_at']
                                        # TODO: should this not be j-1? because j is not part of the staypoint. DB: Changed.
                                        staypoint['finished_at'] = pfs[i0 + mid + 1]['tracked_at']
                                        staypoint['id'] = staypoint_id_counter

                                        # store matching
                                        posfix_staypoint_matching[staypoint_id_counter] = [
                                            pfs[k]['id'] for k in range(i, i0 + mid + 1)]
                                        staypoint_id_counter += 1

                                        # add staypoint
                                        ret_staypoints = ret_staypoints.append(
                                            staypoint, ignore_index=True)

                                        i = i0 + mid + 1
                                        hrsum = 0
                            else:
                                staypoint = {}
                                staypoint['user_id'] = pfs[i]['user_id']
                                staypoint['geom'] = Point(
                                    np.mean([pfs[k]['geom'].x for k in range(i, j + 1)]),
                                    np.mean([pfs[k]['geom'].y for k in range(i, j + 1)]))
                                if 'elevation' in pfs[i].keys():
                                    staypoint['elevation'] = np.mean(
                                        [pfs[k]['elevation'] for k in range(i, j + 1)])
                                if 'velocity' in pfs[i].keys():
                                    staypoint['velocity'] = np.mean(
                                        [pfs[k]['velocity'] for k in range(i, j + 1)])
                                staypoint['started_at'] = pfs[i]['tracked_at']
                                # TODO: should this not be j-1? because j is not part of the staypoint. DB: Changed.
                                staypoint['finished_at'] = pfs[j]['tracked_at']
                                staypoint['id'] = staypoint_id_counter

                                # store matching
                                posfix_staypoint_matching[staypoint_id_counter] = [
                                    pfs[k]['id'] for k in range(i, j + 1)]
                                # posfix_staypoint_matching[staypoint_id_counter] = [
                                #     j]  # rather [k for k in range(i, j)]?
                                staypoint_id_counter += 1

                                ret_staypoints = ret_staypoints.append(
                                    staypoint, ignore_index=True)

                    j = j + 1

            # add matching to the original positionfixes (for every user)
            for staypoints_id, posfix_idlist in posfix_staypoint_matching.items():
                # note that we use .loc because above we have saved the id
                # of the positionfixes, not their absolute position
                positionfixes.loc[posfix_idlist, 'staypoint_id'] = staypoints_id

    elif method == 'dbscan':
        # TODO: Make sure time information is included in the clustering!
        # Time information is in the column 'started_at'; however, the user should be able to
        # adjust the distance metric, e.g. Chebyshev.
        db = DBSCAN(eps=epsilon, min_samples=num_samples)
        for user_id_this in positionfixes['user_id'].unique():

            user_positionfixes = positionfixes[
                positionfixes['user_id'] == user_id_this]  # this is not a copy!
            # TODO: enable transformations to temporary (metric) system
            transform_crs = None
            if transform_crs is not None:
                pass

            # get staypoint matching
            coordinates = np.array([[g.x, g.y] for g in user_positionfixes['geom']])
            labels = db.fit_predict(coordinates)

            # add positionfixes - staypoint matching to original positionfixes
            positionfixes.loc[user_positionfixes.index, 'staypoint_id'] = labels

        # create staypoints as the center of the grouped positionfixes
        grouped_df = positionfixes.groupby(['user_id', 'staypoint_id'])
        for combined_id, group in grouped_df:
            user_id, staypoint_id = combined_id

            if int(staypoint_id) != -1:
                staypoint = {}
                staypoint['user_id'] = user_id
                staypoint['id'] = staypoint_id

                # point geometry of staypoint
                staypoint['geom'] = Point(group.geometry.x.mean(), group.geometry.y.mean())

                ret_staypoints = ret_staypoints.append(staypoint, ignore_index=True)

    ret_staypoints = gpd.GeoDataFrame(ret_staypoints, geometry='geom', crs=positionfixes.crs)
    ret_staypoints['id'] = ret_staypoints['id'].astype('int')

    return ret_staypoints
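For orientation, a hypothetical usage sketch of the modified extractor; the CSV path and the threshold values are illustrative assumptions, not project defaults:

import trackintel as ti

# Illustrative input file; any positionfixes CSV in the trackintel format would do.
pfs = ti.read_positionfixes_csv('positionfixes.csv', sep=';')

stps = extract_staypoints_ipa(pfs,
                              method='sliding',
                              dist_threshold=50,            # m: new staypoint once this is exceeded
                              time_threshold=5 * 60,        # s: minimum dwell time
                              timemax_threshold=12 * 3600)  # s: long stays are split into several staypoints

# The positionfixes now carry a 'staypoint_id' column (-1 where no staypoint was assigned).
print(stps[['user_id', 'started_at', 'finished_at']].head())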