def test_keyword_combinations(self):
    """Exercise several metric/keyword combinations of calculate_distance_matrix.

    Euclidean and haversine are called in parallel on inputs of different
    length (only checking that they run). Minkowski with p=1 must differ from
    p=2, and Minkowski with p=2 must equal the euclidean result.
    """
    csv_path = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    staypoints = ti.read_staypoints_csv(csv_path, tz='utc', index_col='id')
    x, y = staypoints.iloc[0:5], staypoints.iloc[5:15]

    # smoke checks: the results are discarded, the calls just must not raise
    calculate_distance_matrix(X=x, Y=y, dist_metric='euclidean', n_jobs=-1)
    calculate_distance_matrix(X=y, Y=x, dist_metric='haversine', n_jobs=-1)

    minkowski = {}
    for p in (1, 2):
        minkowski[p] = calculate_distance_matrix(X=x, Y=x, dist_metric='minkowski', p=p)
    euclidean = calculate_distance_matrix(X=x, Y=x, dist_metric='euclidean')

    assert not np.array_equal(minkowski[1], minkowski[2])
    assert np.array_equal(euclidean, minkowski[2])
def test_keyword_combinations(self):
    """Run calculate_distance_matrix with different metrics and keyword arguments.

    The parallel euclidean/haversine calls on inputs of unequal length are
    smoke tests only. The Minkowski matrices for p=1 and p=2 are compared
    with each other and with the euclidean matrix.
    """
    staypoints_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(staypoints_path, tz="utc", index_col="id")
    first_five = staypoints.iloc[0:5]
    next_ten = staypoints.iloc[5:15]

    # return values are irrelevant here; the calls only have to succeed
    calculate_distance_matrix(X=first_five, Y=next_ten, dist_metric="euclidean", n_jobs=-1)
    calculate_distance_matrix(X=next_ten, Y=first_five, dist_metric="haversine", n_jobs=-1)

    manhattan_like = calculate_distance_matrix(
        X=first_five, Y=first_five, dist_metric="minkowski", p=1
    )
    minkowski_p2 = calculate_distance_matrix(
        X=first_five, Y=first_five, dist_metric="minkowski", p=2
    )
    euclidean = calculate_distance_matrix(X=first_five, Y=first_five, dist_metric="euclidean")

    assert not np.array_equal(manhattan_like, minkowski_p2)
    assert np.array_equal(euclidean, minkowski_p2)
def test_trajectory_distance(self):
    """Check that the dtw distance matrix is the same with one and with four jobs."""
    tpls_path = os.path.join('tests', 'data', 'geolife', 'geolife_triplegs.csv')
    subset = ti.read_triplegs_csv(tpls_path).iloc[0:4]

    single_core = calculate_distance_matrix(X=subset, dist_metric='dtw', n_jobs=1)
    multi_core = calculate_distance_matrix(X=subset, dist_metric='dtw', n_jobs=4)

    # summed absolute difference between both matrices has to vanish
    total_deviation = np.sum(np.abs(single_core - multi_core))
    assert np.isclose(total_deviation, 0)
def test_trajectory_distance_frechet(self, geolife_tpls):
    """Check that the frechet distance matrix is the same with one and with four jobs."""
    subset = geolife_tpls.iloc[0:4]

    matrices = [
        calculate_distance_matrix(X=subset, dist_metric="frechet", n_jobs=n_jobs)
        for n_jobs in (1, 4)
    ]
    single_core, multi_core = matrices

    # summed absolute difference between both matrices has to vanish
    assert np.isclose(np.sum(np.abs(single_core - multi_core)), 0)
def test_distance_error(self, single_linestring):
    """Check that an AttributeError is raised when MultiLineStrings are passed."""
    # GeoDataFrame whose two rows both hold the same MultiLineString
    multi = MultiLineString([single_linestring, single_linestring])
    rows = [(row_id, multi) for row_id in (0, 1)]
    gdf = gpd.GeoDataFrame(rows, columns=["id", "geometry"])
    gdf = gdf.set_geometry("geometry").set_crs("wgs84")

    with pytest.raises(AttributeError):
        calculate_distance_matrix(X=gdf, dist_metric="dtw", n_jobs=1)
def test_shape_for_different_array_length(self):
    """Check matrix shapes and transpose symmetry for inputs of unequal length.

    For the euclidean and the haversine metric, D(x, y) must have shape
    (len(x), len(y)) and must equal the transpose of D(y, x).
    """
    csv_path = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    staypoints = ti.read_staypoints_csv(csv_path)
    x, y = staypoints.iloc[0:5], staypoints.iloc[5:15]

    forward = {}
    backward = {}
    for metric in ('euclidean', 'haversine'):
        forward[metric] = calculate_distance_matrix(X=x, Y=y, dist_metric=metric)
        backward[metric] = calculate_distance_matrix(X=y, Y=x, dist_metric=metric)

    assert forward['euclidean'].shape == forward['haversine'].shape == (5, 10)
    assert backward['euclidean'].shape == backward['haversine'].shape == (10, 5)
    for metric in ('euclidean', 'haversine'):
        assert np.isclose(0, np.sum(np.abs(forward[metric] - backward[metric].T)))
def test_shape_for_different_array_length(self):
    """Verify output shape and transpose symmetry when X and Y differ in length."""
    staypoints_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(staypoints_path, tz="utc", index_col="id")
    short = staypoints.iloc[0:5]
    long = staypoints.iloc[5:15]

    def both_directions(metric):
        # (short vs. long, long vs. short) matrices for one metric
        short_long = calculate_distance_matrix(X=short, Y=long, dist_metric=metric)
        long_short = calculate_distance_matrix(X=long, Y=short, dist_metric=metric)
        return short_long, long_short

    euc_sl, euc_ls = both_directions("euclidean")
    hav_sl, hav_ls = both_directions("haversine")

    assert euc_sl.shape == hav_sl.shape == (5, 10)
    assert euc_ls.shape == hav_ls.shape == (10, 5)
    # swapping the inputs must yield the transposed matrix
    assert np.isclose(0, np.sum(np.abs(euc_sl - euc_ls.T)))
    assert np.isclose(0, np.sum(np.abs(hav_sl - hav_ls.T)))
def test_trajectory_distance_dtw(self, geolife_tpls):
    """Check that the dtw distance matrix does not depend on the n_jobs setting."""
    subset = geolife_tpls.iloc[0:4]

    # same computation for n_jobs = -1, 0, 1 and 4
    by_jobs = {}
    for n_jobs in (-1, 0, 1, 4):
        by_jobs[n_jobs] = calculate_distance_matrix(X=subset, dist_metric="dtw", n_jobs=n_jobs)

    reference = by_jobs[4]
    for n_jobs in (1, -1, 0):
        assert np.isclose(np.sum(np.abs(by_jobs[n_jobs] - reference)), 0)
def test_dbscan_haversine(self):
    """Compare haversine-based generate_locations with a manual DBSCAN run.

    The number of generated locations must equal the number of distinct labels
    that DBSCAN yields on a precomputed haversine distance matrix.
    """
    staypoints_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(staypoints_path, tz="utc", index_col="id")

    # library path
    staypoints, locations = staypoints.as_staypoints.generate_locations(
        method="dbscan",
        epsilon=10,
        num_samples=0,
        distance_metric="haversine",
        agg_level="dataset",
    )

    # manual path: pairwise haversine matrix handed to DBSCAN as precomputed metric
    haversine_matrix = calculate_distance_matrix(staypoints, dist_metric="haversine")
    manual_labels = DBSCAN(eps=10, min_samples=0, metric="precomputed").fit_predict(
        haversine_matrix
    )

    n_locations = len(set(locations.index))
    n_labels = len(set(manual_labels))
    assert n_locations == n_labels
def test_compare_haversine_to_scikit_xy(self):
    """Compare the haversine distance matrix with scikit-learn's pairwise_distances."""
    csv_path = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    staypoints = ti.read_staypoints_csv(csv_path)
    ours = calculate_distance_matrix(X=staypoints, Y=staypoints, dist_metric='haversine')

    # scikit-learn gets (y, x) pairs in radians
    x_rad = np.asarray([radians(value) for value in staypoints.geometry.x.values])
    y_rad = np.asarray([radians(value) for value in staypoints.geometry.y.values])
    yx = np.concatenate((y_rad.reshape(-1, 1), x_rad.reshape(-1, 1)), axis=1)

    # scale the unit-sphere result by 6371000 to compare with our matrix
    theirs = pairwise_distances(yx, metric='haversine') * 6371000

    deviation = np.abs(ours - theirs)
    assert np.allclose(deviation, 0, atol=0.001)  # atol = 1mm
def test_cluster_staypoints_dbscan_haversine(self):
    """Compare haversine-based extract_locations with a manual DBSCAN run.

    The number of distinct location ids must equal the number of distinct
    labels that DBSCAN yields on a precomputed haversine distance matrix.
    """
    csv_path = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    staypoints = ti.read_staypoints_csv(csv_path)

    # library path
    staypoints, locations = staypoints.as_staypoints.extract_locations(
        method='dbscan',
        epsilon=10,
        num_samples=0,
        distance_matrix_metric='haversine',
        agg_level='dataset',
    )

    # manual path: pairwise haversine matrix handed to DBSCAN as precomputed metric
    haversine_matrix = calculate_distance_matrix(staypoints, dist_metric="haversine")
    clusterer = DBSCAN(eps=10, min_samples=0, metric="precomputed")
    manual_labels = clusterer.fit_predict(haversine_matrix)

    n_locations = len(set(locations['location_id']))
    n_labels = len(set(manual_labels))
    assert n_locations == n_labels, "The #location should be the same"
def test_compare_haversine_to_scikit_xy(self):
    """Check our haversine matrix against scikit-learn's pairwise_distances."""
    staypoints_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(staypoints_path, tz="utc", index_col="id")
    our_matrix = calculate_distance_matrix(X=staypoints, Y=staypoints, dist_metric="haversine")

    def to_radian_column(values):
        # degrees -> radians, shaped as a single column
        return np.asarray([radians(value) for value in values]).reshape(-1, 1)

    x_col = to_radian_column(staypoints.geometry.x.values)
    y_col = to_radian_column(staypoints.geometry.y.values)
    # scikit-learn gets the coordinates in (y, x) order
    yx = np.concatenate((y_col, x_col), axis=1)

    # scale the unit-sphere result by 6371000 to compare with our matrix
    their_matrix = pairwise_distances(yx, metric="haversine") * 6371000

    assert np.allclose(np.abs(our_matrix - their_matrix), 0, atol=0.001)  # atol = 1mm
def test_generate_locations_dbscan_haversine(self):
    """Compare haversine-based generate_locations with a manual DBSCAN run.

    The number of generated locations must equal the number of distinct labels
    that DBSCAN yields on a precomputed haversine distance matrix.
    """
    csv_path = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    staypoints = ti.read_staypoints_csv(csv_path, tz='utc', index_col='id')

    # library path
    staypoints, locations = staypoints.as_staypoints.generate_locations(
        method='dbscan',
        epsilon=10,
        num_samples=0,
        distance_matrix_metric='haversine',
        agg_level='dataset',
    )

    # manual path: pairwise haversine matrix handed to DBSCAN as precomputed metric
    haversine_matrix = calculate_distance_matrix(staypoints, dist_metric="haversine")
    clusterer = DBSCAN(eps=10, min_samples=0, metric="precomputed")
    manual_labels = clusterer.fit_predict(haversine_matrix)

    n_locations = len(set(locations.index))
    n_labels = len(set(manual_labels))
    assert n_locations == n_labels, "The number of locations should be the same"
def weights_delaunay(locations, to_crs=None, distance_matrix_metric='haversine', adjacency_dict=None):
    """
    Calculate graph weights along the Delaunay triangulation of each user's locations.

    For every user, the locations are triangulated and each triangle edge
    becomes a (symmetric) graph edge whose weight is the inverse of the distance
    between its two locations.

    Parameters
    ----------
    locations: GeoDataFrame
        Must provide the columns 'user_id', 'location_id' and 'center'.
        If `to_crs` is None and the columns 'long' and 'lat' exist, these are
        used as point coordinates instead of 'center'.

    to_crs: optional
        If given, it is passed to ``to_crs`` of the 'center' column and the
        projected coordinates are triangulated.

    distance_matrix_metric: string
        Passed as ``dist_metric`` to calculate_distance_matrix().

    adjacency_dict: dict, optional
        Existing dictionary to extend. A new one is created if None.

    Returns
    -------
    dict
        Maps every user id to a dict with the lists 'A' (sparse adjacency
        matrices), 'loc_id_order' (location ids in row/column order) and
        'edge_name' (here always 'delaunay'). One entry is appended to each
        list per call. If the triangulation raises a QhullError, an empty
        matrix and an empty id array are stored for that user.
    """
    if adjacency_dict is None:
        adjacency_dict = {}

    all_users = locations["user_id"].unique()
    sorted_locs = locations.set_index('user_id', drop=False)
    sorted_locs.index.name = 'user_id_ix'
    sorted_locs.sort_index(inplace=True)

    edge_name = 'delaunay'
    # vertex index pairs forming the three edges of a triangle
    triangle_edges = ((0, 1), (0, 2), (1, 2))

    for user_id_this in all_users:
        user_locs = sorted_locs[sorted_locs.index == user_id_this]
        loc_id_order = user_locs['location_id'].values

        # point data as xy coordinates
        if to_crs is not None:
            geometry = user_locs['center'].to_crs(to_crs)
            points = list(zip(geometry.x, geometry.y))
        else:
            try:
                # only this user's points: the triangulation indices have to
                # match the rows of the per-user distance matrix below
                points = list(zip(user_locs['long'], user_locs['lat']))
            except KeyError:
                geometry = user_locs['center']
                points = list(zip(geometry.x, geometry.y))

        # nx graph from scipy.spatial.Delaunay:
        # https://groups.google.com/forum/#!topic/networkx-discuss/D7fMmuzVBAw
        try:
            triangulation = scipy.spatial.Delaunay(points)
        except QhullError:
            A = coo_matrix((0, 0))
            loc_id_order = np.asarray([])
        else:
            # collect the edges as index pairs; sorting each pair avoids
            # adding the same edge twice
            edges = set()
            for simplex in triangulation.simplices:
                for first, second in triangle_edges:
                    u, v = sorted((int(simplex[first]), int(simplex[second])))
                    edges.add((u, v))

            locs_distance_matrix = calculate_distance_matrix(
                user_locs, dist_metric=distance_matrix_metric)
            # invert the distances so that close places get a high weight;
            # `out` makes sure that zero distances end up with weight 0
            # instead of uninitialised memory
            weight_matrix = np.reciprocal(
                locs_distance_matrix,
                out=np.zeros_like(locs_distance_matrix),
                where=locs_distance_matrix != 0)

            row_ixs = [u for u, _ in edges]
            col_ixs = [v for _, v in edges]
            values = [weight_matrix[u, v] for u, v in edges]

            # enforce symmetry: add every edge in both directions
            A = coo_matrix(
                (values + values, (row_ixs + col_ixs, col_ixs + row_ixs)),
                shape=weight_matrix.shape)

        user_entry = adjacency_dict.setdefault(
            user_id_this, {'A': [], 'loc_id_order': [], 'edge_name': []})
        user_entry['A'].append(A)
        user_entry['loc_id_order'].append(loc_id_order)
        user_entry['edge_name'].append(edge_name)

    return adjacency_dict
def weights_n_neighbors(locations, n=None, distance_matrix_metric='haversine', adjacency_dict=None):
    """
    Calculate the distance of the n nearest locations as graph weights.

    Graphs based on the activity locations (trackintel locations) can have several
    types of weighted edges. This function calculates the edge weight based on the
    distance to the n closest neighbors (locations) of the same user. Weights are
    inverse distances, so close locations get a high weight.

    Parameters
    ----------
    locations: GeoDataFrame
        Must provide the columns 'user_id' and 'location_id'.

    n: int or 'fconn', optional
        Number of nearest locations to take into account. If None or 'fconn',
        the full inverse distance matrix is used (fully connected graph).

    distance_matrix_metric: string
        The distance metric used to calculate the distance between locations.
        Uses the Trackintel.geogr.distances.calculate_distance_matrix() function.
        Possible metrics are: {'haversine', 'euclidean'} or any mentioned in:
        https://scikit-learn.org/stable/modules/generated/
        sklearn.metrics.pairwise_distances.html

    adjacency_dict: dict, optional
        Existing dictionary to extend. A new one is created if None.

    Returns
    -------
    dict
        Maps every user id to a dict with the lists 'A' (sparse adjacency
        matrices), 'loc_id_order' (location ids in row/column order) and
        'edge_name' ('fconn_distant' or '<n>_distant'). One entry is appended
        to each list per call.
    """
    # todo: check if cluster id is missing
    # todo: check if adjacency matrix is symmetric?
    if adjacency_dict is None:
        adjacency_dict = {}

    all_users = locations["user_id"].unique()
    sorted_locs = locations.set_index('user_id', drop=False)
    sorted_locs.index.name = 'user_id_ix'
    sorted_locs.sort_index(inplace=True)

    for user_id_this in all_users:
        user_locs = sorted_locs[sorted_locs.index == user_id_this]
        loc_id_order = user_locs['location_id'].values

        locs_distance_matrix = calculate_distance_matrix(
            user_locs, dist_metric=distance_matrix_metric)
        # invert such that close nodes have a high weight; `out` makes sure
        # that zero distances (e.g. the diagonal) end up with weight 0 instead
        # of uninitialised memory
        weight_matrix = np.reciprocal(
            locs_distance_matrix,
            out=np.zeros_like(locs_distance_matrix),
            where=locs_distance_matrix != 0)
        shape = weight_matrix.shape

        if n is None or n == 'fconn':
            A = coo_matrix(weight_matrix)
            edge_name = 'fconn_distant'
        else:
            row_ixs = []
            col_ixs = []
            values = []
            # for every row, keep only the n + 1 largest elements
            # (the slice simply returns fewer entries if the row is shorter)
            for row_ix_this in range(shape[0]):
                row_this = weight_matrix[row_ix_this, :]
                max_ixs = np.argsort(row_this)[::-1][0:n + 1]

                col_ixs.extend(max_ixs)
                row_ixs.extend([row_ix_this] * len(max_ixs))
                values.extend(row_this[max_ixs])

            # enforce symmetry: add every entry a second time, mirrored
            A = coo_matrix(
                (values + values, (row_ixs + col_ixs, col_ixs + row_ixs)),
                shape=shape)
            edge_name = '{}_distant'.format(n)

        user_entry = adjacency_dict.setdefault(
            user_id_this, {'A': [], 'loc_id_order': [], 'edge_name': []})
        user_entry['A'].append(A)
        user_entry['loc_id_order'].append(loc_id_order)
        user_entry['edge_name'].append(edge_name)

    return adjacency_dict
def cluster_staypoints(staypoints,
                       method='dbscan',
                       epsilon=100,
                       num_samples=1,
                       distance_matrix_metric=None,
                       agg_level='user'):
    """Clusters staypoints to get locations.

    Parameters
    ----------
    staypoints : GeoDataFrame
        The staypoints have to follow the standard definition for staypoints DataFrames.

    method : str, {'dbscan'}, default 'dbscan'
        The following methods are available to cluster staypoints into locations:
        'dbscan' : Uses the DBSCAN algorithm to cluster staypoints.

    epsilon : float, default 100
        The epsilon for the 'dbscan' method. It is also used as buffer distance
        for locations whose extent would otherwise be a Point or a LineString.

    num_samples : int, default 1
        The minimal number of samples in a cluster.

    distance_matrix_metric: str (optional)
        When given, dbscan will work on a precomputed a distance matrix that is
        created using the staypoints based on the given metric. Possible metrics are:
        {'haversine', 'euclidean'} or any mentioned in:
        https://scikit-learn.org/stable/modules/generated/
        sklearn.metrics.pairwise_distances.html

    agg_level: str, {'user' or 'dataset'}, default 'user'
        The level of aggregation when generating locations:
        'user'      : locations are generated independently per-user.
        'dataset'   : shared locations are generated for all users.

    Returns
    -------
    ret_sp : GeoDataFrame
        A copy of the input staypoints with the additional column 'location_id'.
        Staypoints labelled as noise by DBSCAN keep the id -1.

    ret_loc : GeoDataFrame
        The generated locations with the columns 'user_id', 'location_id',
        'center' (geometry column) and 'extent'.

    Raises
    ------
    AttributeError
        If `agg_level` or `method` has an unsupported value.

    Examples
    --------
    >>> spts.as_staypoints.cluster_staypoints(method='dbscan', epsilon=100, num_samples=1)
    """
    if agg_level not in ['user', 'dataset']:
        raise AttributeError(
            "The parameter agg_level must be one of ['user', 'dataset'].")
    if method != 'dbscan':
        # without a clustering there are no location ids to build locations from
        raise AttributeError("The parameter method must be one of ['dbscan'].")

    ret_sp = staypoints.copy()
    # use the active geometry column throughout instead of a hard-coded name
    geom_col = ret_sp.geometry.name

    if distance_matrix_metric is not None:
        db = DBSCAN(eps=epsilon, min_samples=num_samples, metric='precomputed')
    else:
        db = DBSCAN(eps=epsilon, min_samples=num_samples)

    def _dbscan_labels(sp):
        # cluster labels for the given staypoints, -1 marks noise
        if distance_matrix_metric is not None:
            sp_distance_matrix = calculate_distance_matrix(
                sp, dist_metric=distance_matrix_metric)
            return db.fit_predict(sp_distance_matrix)
        coordinates = np.array([[g.x, g.y] for g in sp.geometry])
        return db.fit_predict(coordinates)

    if agg_level == 'user':
        location_id_counter = 0
        for user_id_this in ret_sp["user_id"].unique():
            user_staypoints = ret_sp[ret_sp["user_id"] == user_id_this]
            labels = _dbscan_labels(user_staypoints)

            # enforce unique labels across all users without changing noise labels
            max_label = np.max(labels)
            labels[labels != -1] = labels[labels != -1] + location_id_counter + 1
            if max_label > -1:
                location_id_counter = location_id_counter + max_label + 1

            # add staypoint - location matching to original staypoints
            ret_sp.loc[user_staypoints.index, 'location_id'] = labels
    else:
        labels = _dbscan_labels(ret_sp)
        # add 1 to match the 'user' level result, but keep the noise label -1
        # so that noise is removed by the outlier filter below
        labels[labels != -1] = labels[labels != -1] + 1
        ret_sp['location_id'] = labels

    # create locations as grouped staypoints
    temp_sp = ret_sp[['user_id', 'location_id', geom_col]]
    ret_loc = temp_sp.dissolve(by=['user_id', 'location_id'], as_index=False)
    # filter outlier; copy so that the assignments below act on an own frame
    ret_loc = ret_loc.loc[ret_loc['location_id'] != -1].copy()

    def _center(geom):
        # a location with a single staypoint is a Point, otherwise a MultiPoint
        # whose center is the mean of its coordinates
        if geom.geom_type == 'Point':
            return geom
        xs = [pt.x for pt in geom.geoms]
        ys = [pt.y for pt in geom.geoms]
        return Point(np.mean(xs), np.mean(ys))

    ret_loc['center'] = ret_loc[geom_col].apply(_center)

    # extent is the convex hull of the geometry
    ret_loc['extent'] = ret_loc[geom_col].apply(lambda p: p.convex_hull)
    # convex_hull of one point would be a Point and two points a Linestring,
    # we change them into Polygon by creating a buffer of epsilon around them.
    extent_type = ret_loc['extent'].geom_type
    pointLine_idx = (extent_type == 'LineString') | (extent_type == 'Point')

    if pointLine_idx.any():
        if distance_matrix_metric == 'haversine':
            # Perform meter to decimal conversion if the distance metric is haversine
            ret_loc.loc[pointLine_idx, 'extent'] = ret_loc.loc[pointLine_idx].apply(
                lambda p: p['extent'].buffer(
                    meters_to_decimal_degrees(epsilon, p['center'].y)),
                axis=1)
        else:
            ret_loc.loc[pointLine_idx, 'extent'] = ret_loc.loc[pointLine_idx].apply(
                lambda p: p['extent'].buffer(epsilon), axis=1)

    ret_loc = ret_loc.set_geometry('center')
    ret_loc = ret_loc[['user_id', 'location_id', 'center', 'extent']]
    ret_loc['location_id'] = ret_loc['location_id'].astype('int')

    return ret_sp, ret_loc
def cluster_staypoints(staypoints, method='dbscan', epsilon=100, num_samples=3,
                       distance_matrix_metric=None):
    """Clusters staypoints to get places.

    Note that the passed staypoints are modified in place: the column
    'place_id' is added (-1 marks staypoints that DBSCAN labelled as noise).

    Parameters
    ----------
    staypoints : GeoDataFrame
        The staypoints have to follow the standard definition for staypoints DataFrames.

    method : str, {'dbscan'}, default 'dbscan'
        The following methods are available to cluster staypoints into places:
        'dbscan' : Uses the DBSCAN algorithm to cluster staypoints.

    epsilon : float
        The epsilon for the 'dbscan' method.

    num_samples : int
        The minimal number of samples in a cluster.

    distance_matrix_metric: string (optional)
        When given, dbscan will work on a precomputed a distance matrix that is
        created using the staypoints based on the given metric. Possible metrics are:
        {'haversine', 'euclidean'} or any mentioned in:
        https://scikit-learn.org/stable/modules/generated/
        sklearn.metrics.pairwise_distances.html

    Returns
    -------
    GeoDataFrame
        A new GeoDataFrame containing places that a person visited multiple times,
        with the columns 'user_id', 'place_id', 'center' (geometry column) and
        'extent' (convex hull of the staypoints of the place).

    Examples
    --------
    >>> spts.as_staypoints.cluster_staypoints(method='dbscan', epsilon=50, num_samples=3)
    """
    place_columns = ['user_id', 'place_id', 'center', 'extent']
    # one dict per place; the DataFrame is built once at the end instead of
    # appending row by row
    place_records = []

    if method == 'dbscan':
        if distance_matrix_metric is not None:
            db = DBSCAN(eps=epsilon, min_samples=num_samples, metric='precomputed')
        else:
            db = DBSCAN(eps=epsilon, min_samples=num_samples)

        place_id_counter = 0
        for user_id_this in staypoints["user_id"].unique():
            user_staypoints = staypoints[staypoints["user_id"] == user_id_this]

            if distance_matrix_metric is not None:
                sp_distance_matrix = calculate_distance_matrix(
                    user_staypoints, dist_metric=distance_matrix_metric)
                labels = db.fit_predict(sp_distance_matrix)
            else:
                # use the active geometry column instead of a hard-coded name
                coordinates = np.array([[g.x, g.y] for g in user_staypoints.geometry])
                labels = db.fit_predict(coordinates)

            # enforce unique labels across all users without changing noise labels
            max_label = np.max(labels)
            labels[labels != -1] = labels[labels != -1] + place_id_counter + 1
            if max_label > -1:
                place_id_counter = place_id_counter + max_label + 1

            # add staypoint - place matching to original staypoints
            staypoints.loc[user_staypoints.index, 'place_id'] = labels

        # create places as grouped staypoints
        for (user_id, place_id), group in staypoints.groupby(['user_id', 'place_id']):
            if int(place_id) == -1:
                # noise does not form a place
                continue
            place_records.append({
                'user_id': user_id,
                'place_id': place_id,
                # point geometry of place
                'center': Point(group.geometry.x.mean(), group.geometry.y.mean()),
                # polygon geometry of place
                'extent': MultiPoint(points=list(group.geometry)).convex_hull,
            })

    ret_places = pd.DataFrame(place_records, columns=place_columns)
    ret_places = gpd.GeoDataFrame(ret_places, geometry='center', crs=staypoints.crs)
    ret_places['place_id'] = ret_places['place_id'].astype('int')

    return ret_places