def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    """Round-trip a GeoDataFrame through ``to_parquet`` / ``read_parquet``.

    Builds a frame with three geometry columns and one integer column,
    writes it to ``tmp_path / 'df.parq'``, reads it back, and checks that
    the result is a ``GeoDataFrame`` equal to the one that was written.
    """
    # Function-scope import so this test does not depend on a module-level
    # ``pd`` alias being present.
    import pandas as pd

    # Build dataframe; every column is sliced to n rows, where n is the
    # shorter of the multipoint and multiline inputs.
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n)),
    })

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    df_read = read_parquet(path)
    assert isinstance(df_read, GeoDataFrame)

    # ``all(df == df_read)`` iterates the *column labels* of the boolean
    # frame (non-empty strings, always truthy), so it could never fail.
    # Compare the actual contents instead.
    pd.testing.assert_frame_equal(df, df_read)
def test_parquet_columns(gp_point, gp_multipoint, gp_multiline, tmp_path):
    """Check that ``read_parquet`` honours a ``columns`` selection.

    A frame with three geometry columns and one integer column is written
    to ``tmp_path / 'df.parq'``; only ``'a'`` and ``'multiline'`` are read
    back, and the result is compared against the same column subset of the
    frame that was written.
    """
    # Row count: the shorter of the multipoint and multiline inputs.
    size = min(len(gp_multipoint), len(gp_multiline))

    frame_data = {
        'point': GeoSeries(gp_point[:size]),
        'multipoint': GeoSeries(gp_multipoint[:size]),
        'multiline': GeoSeries(gp_multiline[:size]),
        'a': list(range(size)),
    }
    original = GeoDataFrame(frame_data)

    parquet_file = tmp_path / 'df.parq'
    to_parquet(original, parquet_file)

    # Read back only a subset of the columns, in this order.
    subset = ['a', 'multiline']
    restored = read_parquet(str(parquet_file), columns=subset)

    assert isinstance(restored, GeoDataFrame)
    expected = original[subset]
    pd.testing.assert_frame_equal(expected, restored)
def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    """Round-trip a GeoDataFrame with a named index through parquet.

    The frame is written with ``to_parquet`` and read back with
    ``read_parquet`` requesting all four columns explicitly; the test then
    checks the type, the frame contents, and the index name.
    """
    # Row count: the shorter of the multipoint and multiline inputs.
    size = min(len(gp_multipoint), len(gp_multiline))

    geometry_inputs = {
        'point': gp_point,
        'multipoint': gp_multipoint,
        'multiline': gp_multiline,
    }
    frame_data = {
        label: GeoSeries(values[:size])
        for label, values in geometry_inputs.items()
    }
    frame_data['a'] = list(range(size))

    original = GeoDataFrame(frame_data)
    original.index.name = 'range_idx'

    parquet_file = tmp_path / 'df.parq'
    to_parquet(original, parquet_file)

    requested = ['point', 'multipoint', 'multiline', 'a']
    restored = read_parquet(str(parquet_file), columns=requested)

    assert isinstance(restored, GeoDataFrame)
    pd.testing.assert_frame_equal(original, restored)
    assert restored.index.name == original.index.name
def split_device_trajectories(
    file: str,
    output: str,
    study: gpd.GeoDataFrame,
    split_opts: Dict[str, Any],
    min_length: float,
    min_duration: float,
) -> None:
    """Split one device file into trajectories and write them to parquet.

    The output path is ``{output}/device_<part>.parquet``, where ``<part>``
    is the second-to-last dot-separated component of ``file``. Nothing is
    done if that path already exists or if the input has fewer than two
    rows.

    Args:
        file: Path of the input parquet file.
        output: Directory prefix for the output file.
        study: Frame that the trajectories are spatially joined against.
        split_opts: Keyword arguments forwarded to ``split_trajectories``.
        min_length: Minimum ``length`` value a trajectory must have to be kept.
        min_duration: Minimum ``duration`` value a trajectory must have to
            be kept.
    """
    part = file.split('.')[-2]
    destination = f"{output}/device_{part}.parquet"
    if exists(destination):
        # Already produced; skip the work.
        return

    wanted = ["device_id", "latitude", "longitude", "timestamp"]
    records = read_parquet(file)[wanted]
    if len(records) < 2:
        return

    collection = split_trajectories(records, **split_opts)
    trajectories = extract_traj_info(collection)

    # Keep only trajectories meeting both thresholds.
    long_enough = trajectories["length"] >= min_length
    lasting_enough = trajectories["duration"] >= min_duration
    trajectories = trajectories[long_enough & lasting_enough]

    # Spatial join against the study frame; drop the join's helper column.
    joined = gpd.sjoin(trajectories, study)
    joined = joined.drop(columns=["index_right"])

    to_parquet(spd.GeoDataFrame(joined), destination)