def test_read_staypoints_csv_crs_parameter(self):
    """Reading without a crs yields None; passing `crs` attaches it to the result."""
    csv_path = os.path.join('tests', 'data', 'staypoints.csv')
    target_crs = "EPSG:2056"
    without_crs = ti.read_staypoints_csv(csv_path, sep=';', tz='utc', index_col="id")
    assert without_crs.crs is None
    with_crs = ti.read_staypoints_csv(csv_path, sep=';', tz='utc', index_col="id", crs=target_crs)
    assert with_crs.crs == target_crs
def test_staypoints_csv_index_col(self):
    """Test if `index_col` can be set."""
    csv_path = os.path.join('tests', 'data', 'staypoints.csv')
    index_name = 'id'
    with_index = ti.read_staypoints_csv(csv_path, sep=";", index_col=index_name)
    assert with_index.index.name == index_name
    # passing None leaves the default unnamed RangeIndex
    without_index = ti.read_staypoints_csv(csv_path, sep=";", index_col=None)
    assert without_index.index.name is None
def test_set_index(self):
    """Test if `index_col` can be set."""
    csv_path = os.path.join("tests", "data", "staypoints.csv")
    wanted_name = "id"
    named = ti.read_staypoints_csv(csv_path, sep=";", index_col=wanted_name)
    assert named.index.name == wanted_name
    # index_col=None must produce an unnamed index
    unnamed = ti.read_staypoints_csv(csv_path, sep=";", index_col=None)
    assert unnamed.index.name is None
def test_set_crs(self):
    """Test setting the crs when reading."""
    csv_path = os.path.join("tests", "data", "staypoints.csv")
    wanted_crs = "EPSG:2056"
    plain = ti.read_staypoints_csv(csv_path, sep=";", tz="utc", index_col="id")
    assert plain.crs is None
    projected = ti.read_staypoints_csv(csv_path, sep=";", tz="utc", index_col="id", crs=wanted_crs)
    assert projected.crs == wanted_crs
def test_staypoints_from_to_csv(self):
    """Round-trip test: read staypoints, write them back, and compare the files.

    Also checks that a file with renamed columns can be read via the
    `columns` mapping and equals the original data.
    """
    orig_file = os.path.join('tests', 'data', 'staypoints.csv')
    mod_file = os.path.join('tests', 'data', 'staypoints_mod_columns.csv')
    tmp_file = os.path.join('tests', 'data', 'staypoints_test.csv')
    stps = ti.read_staypoints_csv(orig_file, sep=';', tz='utc', index_col="id")
    # FIX: the column mapping was a redaction artifact ('******'); the modified
    # file stores the user column as 'User', which must map to 'user_id' for
    # the equality check below to be meaningful.
    mod_stps = ti.read_staypoints_csv(mod_file, columns={'User': 'user_id'}, sep=';', index_col="id")
    assert mod_stps.equals(stps)
    # serialize timestamps the same way the reference file stores them ('Z' suffix)
    stps['started_at'] = stps['started_at'].apply(lambda d: d.isoformat().replace('+00:00', 'Z'))
    stps['finished_at'] = stps['finished_at'].apply(lambda d: d.isoformat().replace('+00:00', 'Z'))
    columns = ['user_id', 'started_at', 'finished_at', 'elevation', 'geom']
    stps.as_staypoints.to_csv(tmp_file, sep=';', columns=columns)
    assert filecmp.cmp(orig_file, tmp_file, shallow=False)
    os.remove(tmp_file)
def test_generate_locations_dtype_consistent(self):
    """Test the dtypes for the generated columns."""
    stps_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    stps = ti.read_staypoints_csv(stps_file, tz="utc", index_col="id")
    stps, locs = stps.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    assert stps["user_id"].dtype == locs["user_id"].dtype
    assert stps["location_id"].dtype == "Int64"
    assert locs.index.dtype == "int64"
    # the same dtype guarantees must hold with string user ids
    stps["user_id"] = stps["user_id"].apply(str)
    stps, locs = stps.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    assert stps["user_id"].dtype == locs["user_id"].dtype
    assert stps["location_id"].dtype == "Int64"
    assert locs.index.dtype == "int64"
def test_generate_locations_dbscan_hav_euc(self):
    """Haversine and euclidean clustering should yield equally many locations."""
    stps_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    stps = ti.read_staypoints_csv(stps_file, tz="utc", index_col="id")
    # cluster on geographic coordinates using haversine
    _, loc_har = stps.as_staypoints.generate_locations(
        method="dbscan", epsilon=100, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    stps.crs = "epsg:4326"  # WGS_1984
    stps = stps.to_crs("epsg:32649")  # WGS_1984_UTM_Zone_49N
    # cluster on projected coordinates using euclidean
    _, loc_eu = stps.as_staypoints.generate_locations(
        method="dbscan", epsilon=100, num_samples=0, distance_metric="euclidean", agg_level="dataset"
    )
    assert len(loc_har) == len(
        loc_eu
    ), "The #location should be the same for haversine" + "and euclidean distances"
def test_filter_staypoints(self):
    """Spatially filter staypoints against a polygon area (within/intersects/crosses)."""
    # read staypoints and area file
    spts_file = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    area_file = os.path.join('tests', 'data', 'area', 'tsinghua.geojson')
    spts = ti.read_staypoints_csv(spts_file, tz='utc', index_col='id')
    extent = gpd.read_file(area_file)
    # the projection needs to be defined: WGS84
    spts.crs = 'epsg:4326'
    within_spts = spts.as_staypoints.spatial_filter(areas=extent, method="within", re_project=True)
    intersects_spts = spts.as_staypoints.spatial_filter(areas=extent, method="intersects", re_project=True)
    crosses_spts = spts.as_staypoints.spatial_filter(areas=extent, method="crosses", re_project=True)
    # the result obtained from ArcGIS
    gis_within_num = 13
    assert len(within_spts) == gis_within_num, "The spatial filtered sp number should be the same as" + \
        "the one from the result with ArcGIS"
    assert all(within_spts.geometry == intersects_spts.geometry), "For sp the result of within and" + \
        "intersects should be the same"
    assert len(crosses_spts) == 0, "There will be no point crossing area"
def test_cluster_staypoints_dbscan_user_dataset(self):
    """Dataset-level clustering merges users into one location; user-level keeps two."""
    spts = ti.read_staypoints_csv('tests/data/geolife/geolife_staypoints.csv')
    # duplicate the first staypoint once
    spts = spts.head(1)
    spts = spts.append(spts, ignore_index=True)
    # the duplicate belongs to a different user (column 5 — presumably user_id; verify against schema)
    spts.iloc[1, 5] = 1
    # replicate the pair several times
    spts = spts.append([spts] * 5, ignore_index=True)
    _, locs_ds = spts.as_staypoints.extract_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    _, locs_us = spts.as_staypoints.extract_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='user'
    )
    loc_ds_num = locs_ds['location_id'].unique().shape[0]
    loc_us_num = locs_us['location_id'].unique().shape[0]
    assert loc_ds_num == 1, "Considering all staypoints at once, there should be only one location"
    assert loc_us_num == 2, "Considering user staypoints separately, there should be two locations"
def test_generate_locations_dtype_consistent(self):
    """Test the dtypes for the generated columns."""
    stps_file = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    stps = ti.read_staypoints_csv(stps_file, tz='utc', index_col='id')
    stps, locs = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    assert stps['user_id'].dtype == locs['user_id'].dtype
    assert stps['location_id'].dtype == 'Int64'
    assert locs.index.dtype == 'int64'
    # dtypes must stay consistent even when user ids are strings
    stps['user_id'] = stps['user_id'].apply(str)
    stps, locs = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    assert stps['user_id'].dtype == locs['user_id'].dtype
    assert stps['location_id'].dtype == 'Int64'
    assert locs.index.dtype == 'int64'
def test_generate_locations_dbscan_user_dataset(self):
    """One location when aggregating the whole dataset, two when grouping by user."""
    stps_file = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    stps = ti.read_staypoints_csv(stps_file, tz='utc', index_col='id')
    # keep only the first staypoint and duplicate it once
    stps = stps.head(1)
    stps = stps.append(stps, ignore_index=True)
    # the copy belongs to a different user (column 4 — presumably user_id; verify against schema)
    stps.iloc[1, 4] = 1
    # replicate the pair several times
    stps = stps.append([stps] * 5, ignore_index=True)
    _, locs_ds = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    _, locs_us = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='user'
    )
    loc_ds_num = len(locs_ds.index.unique())
    loc_us_num = len(locs_us.index.unique())
    assert loc_ds_num == 1, "Considering all staypoints at once, there should be only one location"
    assert loc_us_num == 2, "Considering user staypoints separately, there should be two locations"
def test_dbscan_user_dataset(self):
    """Test user and dataset location generation."""
    sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
    # build a frame of identical points owned by two different users
    sp = sp.head(1)
    sp = sp.append(sp, ignore_index=True)
    # second row gets another user id (column 4 — presumably user_id; verify against schema)
    sp.iloc[1, 4] = 1
    sp = sp.append([sp] * 5, ignore_index=True)
    _, locs_ds = sp.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    _, locs_us = sp.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="user"
    )
    assert len(locs_ds.index.unique()) == 1
    assert len(locs_us.index.unique()) == 2
def test_dbscan_hav_euc(self):
    """Test if using haversine and euclidean distances will generate the same location result."""
    sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
    # first cluster on geographic coordinates
    _, loc_har = sp.as_staypoints.generate_locations(
        method="dbscan", epsilon=100, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    sp.crs = "epsg:4326"  # WGS_1984
    sp = sp.to_crs("epsg:32649")  # WGS_1984_UTM_Zone_49N
    # then cluster on projected coordinates
    _, loc_eu = sp.as_staypoints.generate_locations(
        method="dbscan", epsilon=100, num_samples=0, distance_metric="euclidean", agg_level="dataset"
    )
    assert len(loc_har) == len(loc_eu)
def test_keyword_combinations(self):
    """Exercise calculate_distance_matrix keyword combinations and metric identities."""
    spts_file = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    spts = ti.read_staypoints_csv(spts_file, tz='utc', index_col='id')
    head = spts.iloc[0:5]
    tail = spts.iloc[5:15]
    # asymmetric X/Y combinations with parallel jobs must not raise
    _ = calculate_distance_matrix(X=head, Y=tail, dist_metric='euclidean', n_jobs=-1)
    _ = calculate_distance_matrix(X=tail, Y=head, dist_metric='haversine', n_jobs=-1)
    d_mink1 = calculate_distance_matrix(X=head, Y=head, dist_metric='minkowski', p=1)
    d_mink2 = calculate_distance_matrix(X=head, Y=head, dist_metric='minkowski', p=2)
    d_euc = calculate_distance_matrix(X=head, Y=head, dist_metric='euclidean')
    # minkowski p=1 differs from p=2; p=2 coincides with euclidean
    assert not np.array_equal(d_mink1, d_mink2)
    assert np.array_equal(d_euc, d_mink2)
def test_cluster_staypoints_dbscan_loc(self):
    """Locations from extract_locations match manually aggregated staypoint groups."""
    spts = ti.read_staypoints_csv('tests/data/geolife/geolife_staypoints.csv')
    spts, locs = spts.as_staypoints.extract_locations(
        method='dbscan', epsilon=10, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    # create locations as grouped staypoints, another way to create locations
    manual_locs = pd.DataFrame(columns=['user_id', 'location_id', 'center'])
    for (user_id, location_id), group in spts.groupby(['user_id', 'location_id']):
        group.set_geometry(spts.geometry.name, inplace=True)
        if int(location_id) == -1:
            continue  # noise points do not form a location
        record = {
            'user_id': user_id,
            'location_id': location_id,
            # point geometry of place
            'center': Point(group.geometry.x.mean(), group.geometry.y.mean()),
        }
        manual_locs = manual_locs.append(record, ignore_index=True)
    manual_locs = gpd.GeoDataFrame(manual_locs, geometry='center', crs=spts.crs)
    assert all(manual_locs['center'] == locs['center']), "The location geometry should be the same"
    assert all(manual_locs['location_id'] == locs['location_id']), "The location id should be the same" + \
        "and start from one"
def test_keyword_combinations(self):
    """calculate_distance_matrix should accept all documented keyword combinations."""
    stps_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    stps = ti.read_staypoints_csv(stps_file, tz="utc", index_col="id")
    first = stps.iloc[0:5]
    second = stps.iloc[5:15]
    # mixed X/Y with parallelism should run without error
    _ = calculate_distance_matrix(X=first, Y=second, dist_metric="euclidean", n_jobs=-1)
    _ = calculate_distance_matrix(X=second, Y=first, dist_metric="haversine", n_jobs=-1)
    d_mink1 = calculate_distance_matrix(X=first, Y=first, dist_metric="minkowski", p=1)
    d_mink2 = calculate_distance_matrix(X=first, Y=first, dist_metric="minkowski", p=2)
    d_euc = calculate_distance_matrix(X=first, Y=first, dist_metric="euclidean")
    # minkowski p=2 coincides with euclidean, p=1 does not
    assert not np.array_equal(d_mink1, d_mink2)
    assert np.array_equal(d_euc, d_mink2)
def test_filter_staypoints(self):
    """Test if spatial_filter works for staypoints."""
    # read staypoints and area file
    sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
    extent = gpd.read_file(os.path.join("tests", "data", "area", "tsinghua.geojson"))
    # the projection needs to be defined: WGS84
    sp.crs = "epsg:4326"
    within_sp = sp.as_staypoints.spatial_filter(areas=extent, method="within", re_project=True)
    intersects_sp = sp.as_staypoints.spatial_filter(areas=extent, method="intersects", re_project=True)
    crosses_sp = sp.as_staypoints.spatial_filter(areas=extent, method="crosses", re_project=True)
    # the result obtained from ArcGIS
    gis_within_num = 13
    assert len(within_sp) == gis_within_num, (
        "The spatial filtered sp number should be the same as" + "the one from the result with ArcGIS"
    )
    assert len(crosses_sp) == 0, "There will be no point crossing area"
    # For staypoints the result of within and intersects should be the same
    assert_geodataframe_equal(within_sp, intersects_sp, check_less_precise=True)
def test_generate_locations_dbscan_hav_euc(self):
    """Haversine and euclidean clustering produce equally many locations."""
    stps_file = os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv')
    stps = ti.read_staypoints_csv(stps_file, tz='utc', index_col='id')
    # cluster with haversine on geographic coordinates
    _, loc_har = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=100, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    stps.crs = 'epsg:4326'  # WGS_1984
    stps = stps.to_crs("epsg:32649")  # WGS_1984_UTM_Zone_49N
    # cluster with euclidean on projected coordinates
    _, loc_eu = stps.as_staypoints.generate_locations(
        method='dbscan', epsilon=100, num_samples=0, distance_matrix_metric='euclidean', agg_level='dataset'
    )
    assert len(loc_har) == len(loc_eu), "The #location should be the same for haversine" + \
        "and euclidean distances"
def test_dbscan_loc(self):
    """Test haversine dbscan location result with manually grouping the locations method."""
    stps_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    stps = ti.read_staypoints_csv(stps_file, tz="utc", index_col="id")
    stps, locs = stps.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    # rebuild the locations manually from the grouped staypoints
    manual_locs = pd.DataFrame(columns=["user_id", "id", "center"])
    for (user_id, location_id), group in stps.groupby(["user_id", "location_id"]):
        group.set_geometry(stps.geometry.name, inplace=True)
        if int(location_id) == -1:
            continue  # noise staypoints are not aggregated into a location
        manual_locs = manual_locs.append(
            {
                "user_id": user_id,
                "id": location_id,
                # point geometry of place
                "center": Point(group.geometry.x.mean(), group.geometry.y.mean()),
            },
            ignore_index=True,
        )
    manual_locs = gpd.GeoDataFrame(manual_locs, geometry="center", crs=stps.crs)
    manual_locs.set_index("id", inplace=True)
    assert all(manual_locs["center"] == locs["center"])
    assert all(manual_locs.index == locs.index)
def test_haversine_vectorized(self):
    """Our vectorized haversine agrees with sklearn's haversine_distances."""
    spts = ti.read_staypoints_csv(os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv'))
    lon = spts.geometry.x.values
    lat = spts.geometry.y.values
    # all unique point pairs (upper triangle, diagonal excluded)
    ix_1, ix_2 = np.triu_indices(len(lon), k=1)
    # our distance
    d_ours = haversine_dist(lon[ix_1], lat[ix_1], lon[ix_2], lat[ix_2])
    # their distance: inputs converted to radians, latitude column first
    lon_rad = np.asarray([radians(v) for v in lon])
    lat_rad = np.asarray([radians(v) for v in lat])
    latlon = np.concatenate((lat_rad.reshape(-1, 1), lon_rad.reshape(-1, 1)), axis=1)
    d_theirs = (haversine_distances(latlon, latlon) * 6371000)[ix_1, ix_2]
    assert np.sum(np.abs(d_ours - d_theirs)) < 0.01  # 1cm for 58 should be good enough
def test_cluster_staypoints_dbscan_hav_euc(self):
    """extract_locations yields the same location count for haversine and euclidean."""
    spts = ti.read_staypoints_csv(os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv'))
    # haversine calculation
    _, loc_har = spts.as_staypoints.extract_locations(
        method='dbscan', epsilon=100, num_samples=0, distance_matrix_metric='haversine', agg_level='dataset'
    )
    spts.crs = 'epsg:4326'  # WGS_1984
    spts = spts.to_crs("epsg:32649")  # WGS_1984_UTM_Zone_49N
    # euclidean calculation
    _, loc_eu = spts.as_staypoints.extract_locations(
        method='dbscan', epsilon=100, num_samples=0, distance_matrix_metric='euclidean', agg_level='dataset'
    )
    assert len(loc_har) == len(loc_eu), "The #location should be the same for haversine" + \
        "and euclidean distances"
def test_as_staypoints_accessor(self):
    """Accessor works on a valid frame and fails once 'geom' is dropped."""
    stps = ti.read_staypoints_csv('tests/data/staypoints.csv', sep=';')
    assert stps.as_staypoints
    without_geom = stps.drop(['geom'], axis=1)
    with pytest.raises(AttributeError):
        without_geom.as_staypoints
def test_as_staypoints_accessor(self):
    """Accessor is valid until the geometry column is removed."""
    stps_file = os.path.join('tests', 'data', 'staypoints.csv')
    stps = ti.read_staypoints_csv(stps_file, sep=';', index_col='id')
    assert stps.as_staypoints
    broken = stps.drop(['geom'], axis=1)
    with pytest.raises(AttributeError):
        broken.as_staypoints
def test_staypoints_from_gpd(self):
    """staypoints_from_gpd must match reading the same data from csv."""
    gdf = gpd.read_file(os.path.join('tests', 'data', 'staypoints.geojson'))
    gdf.set_index('id', inplace=True)
    from_gpd = ti.io.from_geopandas.staypoints_from_gpd(gdf, 'start_time', 'end_time', geom='geometry', tz='utc')
    csv_path = os.path.join('tests', 'data', 'staypoints.csv')
    from_csv = ti.read_staypoints_csv(csv_path, sep=';', tz='utc', index_col='id')
    pd.testing.assert_frame_equal(from_gpd, from_csv, check_exact=False)
def test_create_activity_flag(self):
    """create_activity_flag restores the expected 'activity' column."""
    spts_test = ti.read_staypoints_csv(
        os.path.join('tests', 'data', 'geolife', 'geolife_staypoints.csv'))
    expected = spts_test['activity'].copy()
    # wipe the flag, then let the accessor recompute it
    spts_test['activity'] = False
    spts_test = spts_test.as_staypoints.create_activity_flag()
    pd.testing.assert_series_equal(spts_test['activity'], expected)
def testdata_locs():
    """Read location test data from files."""
    sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
    # only the generated locations are needed by the callers
    _, locs = sp.as_staypoints.generate_locations(
        method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
    )
    return locs
def test_staypoints_plot(self):
    """Plot staypoints to a file and verify the image was written."""
    out_png = 'tests/data/staypoints_plot.png'
    fixes = ti.read_positionfixes_csv('tests/data/positionfixes.csv', sep=';')
    points = ti.read_staypoints_csv('tests/data/staypoints.csv', sep=';')
    points.as_staypoints.plot(out_filename=out_png, radius=0.01, positionfixes=fixes, plot_osm=False)
    assert os.path.exists(out_png)
    os.remove(out_png)
def test_create_activity_flag(self):
    """Test if 'activity' = True is assigned to staypoints."""
    stps_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    stps_test = ti.read_staypoints_csv(stps_file, tz="utc", index_col="id")
    expected_activity = stps_test["activity"].copy()
    # clear the flag, then recompute it via the accessor
    stps_test["activity"] = False
    stps_test = stps_test.as_staypoints.create_activity_flag()
    pd.testing.assert_series_equal(stps_test["activity"], expected_activity)
def test_print_progress_flag(self, capsys):
    """Test if the print_progress bar controls the printing behavior."""
    file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(file, tz="utc", index_col="id")
    # with the flag on, something must appear on stderr
    staypoints.as_staypoints.generate_locations(print_progress=True)
    assert capsys.readouterr().err != ""
    # with the flag off, stderr stays silent
    staypoints.as_staypoints.generate_locations(print_progress=False)
    assert capsys.readouterr().err == ""
def test_staypoints_plot(self):
    """Use trackintel visualization function to plot staypoints and check if file exists."""
    out_png = os.path.join('tests', 'data', 'staypoints_plot.png')
    fixes = ti.read_positionfixes_csv(
        os.path.join('tests', 'data', 'positionfixes.csv'), sep=';', index_col='id', crs='EPSG:4326')
    points = ti.read_staypoints_csv(
        os.path.join('tests', 'data', 'staypoints.csv'), sep=';', index_col='id', crs='EPSG:4326')
    points.as_staypoints.plot(out_filename=out_png, radius=0.01, positionfixes=fixes, plot_osm=False)
    assert os.path.exists(out_png)
    os.remove(out_png)