def create_all_polygons_on_grid(self):
    """
    Build one square polygon for every cell of the virtual grid.

    The grid is walked row by row (latitude) and, inside each row, column by
    column (longitude), starting at (`lat_min_y`, `lon_min_x`) and advancing
    by `cell_size_by_degree` on each step. The resulting 2-D array of
    polygons is stored in the `grid_polygon` attribute.
    """
    operation = begin_operation('create_all_polygons_on_grid')
    logger.debug('\nCreating all polygons on virtual grid')

    n_rows = self.grid_size_lat_y
    n_cols = self.grid_size_lon_x
    step = self.cell_size_by_degree
    polygons = np.array([[None] * n_cols for _ in range(n_rows)])

    south = self.lat_min_y
    for row in progress_bar(range(n_rows), desc='Creating polygons'):
        north = south + step
        west = self.lon_min_x
        for col in range(n_cols):
            east = west + step
            # Create the polygon of the cell from its four corners.
            polygons[row][col] = Polygon((
                (west, south),
                (west, north),
                (east, north),
                (east, south),
            ))
            west += step
        south += step

    self.grid_polygon = polygons
    logger.debug('...geometries saved on Grid grid_polygon property')
    self.last_operation = end_operation(operation)
def _prepare_segmentation(move_data: DataFrame, label_id: str, label_new_tid: str):
    """
    Prepare a dataframe for segmentation.

    When the index has no name, `label_id` is set as the index (in place).
    When `label_new_tid` is not yet a column, it is created and filled with
    the initial tid.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be segmented; modified in place
    label_id : str
        Label of the id feature, used as index when the index is unnamed
    label_new_tid : str
        Label of the feature that will hold the new segment ids

    Returns
    -------
    int
        initial curr_tid (always 0)
    pandas.Index
        unique values of the dataframe index
    int
        initial count (always 0)

    """
    index_is_unnamed = move_data.index.name is None
    if index_is_unnamed:
        logger.debug(f'...setting {label_id} as index')
        move_data.set_index(label_id, inplace=True)

    initial_tid, initial_count = 0, 0
    if label_new_tid not in move_data:
        move_data[label_new_tid] = initial_tid

    return initial_tid, move_data.index.unique(), initial_count
def create_all_polygons_to_all_point_on_grid(self, data: DataFrame) -> DataFrame:
    """
    Create the grid-cell polygon for every distinct (id, cell) pair in `data`.

    If the grid index columns are missing from `data`, they are created first
    (in place) through `create_update_index_grid_feature`.

    Parameters
    ----------
    data : DataFrame
        Represents the dataset with contains lat, long and datetime

    Returns
    -------
    DataFrame
        The de-duplicated (TRAJ_ID, INDEX_GRID_LAT, INDEX_GRID_LON) rows of
        `data` with a new key 'polygon' where the polygons were saved.

    """
    operation = begin_operation('create_all_polygons_to_all_point_on_grid')

    has_grid_indexes = INDEX_GRID_LAT in data and INDEX_GRID_LON in data
    if not has_grid_indexes:
        self.create_update_index_grid_feature(data, unique_index=False)

    cells = data[[TRAJ_ID, INDEX_GRID_LAT, INDEX_GRID_LON]].drop_duplicates()

    def _polygon_of(row):
        # One polygon per row, located by the row's grid indexes.
        return self.create_one_polygon_to_point_on_grid(
            row[INDEX_GRID_LAT], row[INDEX_GRID_LON])

    cell_polygons = cells.apply(_polygon_of, axis=1)
    logger.debug('...polygons were created')

    cells['polygon'] = cell_polygons
    self.last_operation = end_operation(operation)
    return cells
def point_to_index_grid(self, event_lat: float, event_lon: float) -> tuple[int, int]:
    """
    Locate the grid indexes (y, x) of a point (lat, long).

    The offsets from the grid origin (`lat_min_y`, `lon_min_x`) are divided
    by `cell_size_by_degree` and floored.

    Parameters
    ----------
    event_lat : float
        Represents the latitude of a point
    event_lon : float
        Represents the longitude of a point

    Returns
    -------
    Tuple[int, int]
        Represents the index y in a grid of a point (lat, long)
        Represents the index x in a grid of a point (lat, long)
        Both are the output of `np.floor`, i.e. floored floating point
        values; no integer cast is applied here.

    """
    # The operation label must name this method; it previously carried the
    # label of `create_all_polygons_to_all_point_on_grid` (copy-paste slip).
    operation = begin_operation('point_to_index_grid')

    cell_size = self.cell_size_by_degree
    indexes_lat_y = np.floor(
        (np.float64(event_lat) - self.lat_min_y) / cell_size)
    indexes_lon_x = np.floor(
        (np.float64(event_lon) - self.lon_min_x) / cell_size)

    logger.debug(
        '...[%s,%s] indexes were created to lat and lon'
        % (indexes_lat_y.size, indexes_lon_x.size))
    self.last_operation = end_operation(operation)
    return indexes_lat_y, indexes_lon_x
def by_max_speed(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    max_speed_between_adj_points: float = 50.0,
    drop_single_points: bool = True,
    label_new_tid: str = TID_SPEED,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Segment the trajectories using a maximum speed between adjacent points.

    The segmentation itself is delegated to `_filter_by` on the
    SPEED_TO_PREV feature, which is generated first when missing.

    Parameters
    ----------
    move_data : dataframe.
       The input trajectory data.
    label_id : str, optional
         Indicates the label of the id column in the users dataframe,
         by default TRAJ_ID
    max_speed_between_adj_points : float, optional
        Specify the maximum speed between two adjacent points, by default 50
    drop_single_points : boolean, optional
        If set to True, drops the trajectories with only one point,
        by default True
    label_new_tid : str, optional
        The label of the column containing the ids of the formed segments,
        by default TID_SPEED
    inplace : boolean, optional
        if set to true the operation is applied to the given dataframe and
        None is returned, otherwise a copy is segmented and returned,
        by default False

    Returns
    -------
    DataFrame
        The segmented dataframe (result of `_filter_by`), or None when
        `inplace` is True

    Note
    ----
    Speed features must be updated after split.

    """
    working = move_data if inplace else move_data.copy()

    logger.debug(
        'Split trajectories by max_speed_between_adj_points: {}'.format(
            max_speed_between_adj_points))

    if SPEED_TO_PREV not in working:
        working.generate_dist_time_speed_features()

    segmented = _filter_by(
        working,
        label_id,
        label_new_tid,
        drop_single_points,
        feature=SPEED_TO_PREV,
        max_between_adj_points=max_speed_between_adj_points,
        all=False,
    )

    if inplace:
        return None
    return segmented
def clean_gps_speed_max_radius(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    speed_max: float = 50.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Remove trajectory points that exceed a maximum speed.

    The selection of points is delegated to `_filter_speed_max_radius`,
    applied repeatedly by `_clean_gps` on the SPEED_TO_PREV feature until no
    further rows are selected. The speed features are generated first when
    SPEED_TO_PREV is missing.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    speed_max : float, optional
        Maximum speed passed to the filter as its threshold, by default 50
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        if set to true the operation is done in place, the original
        dataframe will be altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories, or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    if SPEED_TO_PREV not in frame:
        frame.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype)

    logger.debug(
        '\nClean gps points with speed max > %s meters by seconds' % speed_max)

    criteria = {'arg1': SPEED_TO_PREV, 'arg2': speed_max, 'outliers': False}
    cleaned = _clean_gps(frame, _filter_speed_max_radius, **criteria)

    if inplace:
        return None
    return cleaned
def outliers(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    jump_coefficient: float = 3.0,
    threshold: float = 1,
    new_label: str = OUTLIER,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Create or update a boolean feature to detect outliers.

    A point is flagged when its three distance features are all above
    `threshold` and both its distance to the previous and to the next point
    exceed `jump_coefficient` times the distance between its neighbours.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    jump_coefficient : float, optional
        Multiplier applied to DIST_PREV_TO_NEXT to obtain the jump limit,
        by default 3
    threshold : float, optional
        Minimum value that the distance features must have
        in order to be considered outliers, by default 1
    new_label: string, optional
        The name of the new boolean feature, by default OUTLIER
    inplace : bool, optional
        if set to true the original dataframe will be altered and None is
        returned, otherwise a copy will be returned, by default False

    Returns
    -------
    DataFrame
        Returns a dataframe with the outlier feature, or None when
        `inplace` is True

    """
    if not inplace:
        move_data = move_data.copy()

    if DIST_TO_PREV not in move_data:
        move_data.generate_dist_features()

    if move_data.index.name is not None:
        logger.debug('...Reset index for filtering\n')
        move_data.reset_index(inplace=True)

    # Every distance feature must be present. The previous condition only
    # checked the truthiness of the DIST_TO_NEXT constant, so a missing
    # column surfaced later as a KeyError instead of the warning below.
    required = (DIST_TO_PREV, DIST_TO_NEXT, DIST_PREV_TO_NEXT)
    if all(label in move_data for label in required):
        jump = jump_coefficient * move_data[DIST_PREV_TO_NEXT]
        filter_ = (
            (move_data[DIST_TO_NEXT] > threshold)
            & (move_data[DIST_TO_PREV] > threshold)
            & (move_data[DIST_PREV_TO_NEXT] > threshold)
            & (jump < move_data[DIST_TO_NEXT])
            & (jump < move_data[DIST_TO_PREV])
        )
        move_data[new_label] = filter_
    else:
        logger.warning('...Distances features were not created')

    if not inplace:
        return move_data
def create_or_update_gps_block_signal(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    max_time_stop: float = 7200,
    new_label: str = BLOCK,
    label_tid: str = TID_PART,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Flag the segments whose accumulated time reaches `max_time_stop`.

    The data is segmented with `segmentation.by_max_dist` using a maximum
    distance of 0.0 between adjacent points, the distance/time/speed
    features are regenerated per segment, and every point of a segment whose
    summed TIME_TO_PREV is greater than or equal to `max_time_stop` gets
    `new_label` set to True (False otherwise).

    Parameters
    ----------
    move_data: dataFrame
        The input trajectories data.
    max_time_stop: float, optional
        Minimum accumulated time of a segment for it to be flagged,
        by default 7200
    new_label: string, optional
        The name of the new boolean feature, by default BLOCK
    label_tid : str, optional
        The label of the column containing the ids of the formed segments,
        by default TID_PART
    inplace : boolean, optional
        if set to true the original dataframe will be altered and None is
        returned, otherwise a copy will be returned, by default False

    Returns
    -------
    DataFrame
        DataFrame with the additional features or None

    """
    frame = move_data if inplace else move_data.copy()

    message = 'Create or update block_signal if max time stop > %s seconds\n'
    logger.debug(message % max_time_stop)

    segmentation.by_max_dist(
        frame,
        max_dist_between_adj_points=0.0,
        label_new_tid=label_tid,
        inplace=True,
    )

    logger.debug('Updating dist time speed values')
    frame.generate_dist_time_speed_features(label_id=label_tid)

    frame[new_label] = False

    # Total time spent in each segment, indexed by segment id.
    time_per_segment = (
        frame.groupby(by=label_tid).agg({TIME_TO_PREV: 'sum'})[TIME_TO_PREV]
    )
    blocked_tids = time_per_segment[time_per_segment >= max_time_stop].index
    frame.loc[frame[label_tid].isin(blocked_tids), new_label] = True

    return _end_create_operation(frame, new_label, inplace)
def elbow_method(
    move_data: DataFrame,
    k_initial: int = 1,
    max_clusters: int = 15,
    k_iteration: int = 1,
    random_state: int | None = None
) -> dict:
    """
    Compute the KMeans inertia for a range of cluster counts.

    KMeans is fitted on the latitude and longitude columns once for every
    `k` in `range(k_initial, max_clusters + 1, k_iteration)`; the inertia
    values can then be inspected with the elbow method.

    Parameters
    ----------
    move_data : dataframe
       The input trajectory data.
    k_initial: int, optional
        The first number of clusters to test, by default 1
    max_clusters: int, optional
        The largest number of clusters to test (inclusive), by default 15
    k_iteration: int, optional
        Increment between tested numbers of clusters, by default 1
    random_state: int, RandomState instance
        Forwarded to KMeans; use an int to make the randomness
        deterministic, by default None

    Returns
    -------
    dict
        The inertia values, keyed by the number of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df, k_iteration=3)
        {
            1: 55084.15957839036,
            4: 245.68365592382938,
            7: 92.31472644640075,
            10: 62.618599956870355,
            13: 45.59653757292055,
        }

    """
    logger.debug(
        'Executing Elbow Method for {} to {} clusters at {} steps\n'.format(
            k_initial, max_clusters, k_iteration))

    cluster_counts = range(k_initial, max_clusters + 1, k_iteration)
    return {
        k: KMeans(n_clusters=k, random_state=random_state)
        .fit(move_data[[LATITUDE, LONGITUDE]])
        .inertia_
        for k in progress_bar(cluster_counts, desc='Running KMeans')
    }
def clean_gps_jumps_by_distance(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    jump_coefficient: float = 3.0,
    threshold: float = 1,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Remove the trajectory points detected as outliers (gps jumps).

    The detection is done by `outliers`, applied repeatedly by `_clean_gps`
    until no further rows are selected. The distance features are generated
    first when DIST_TO_PREV is missing.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
         Indicates the label of the id column in the user dataframe,
         by default TRAJ_ID
    jump_coefficient : float, optional
        Forwarded to the outlier detection, by default 3
    threshold : float, optional
        Minimum value that the distance features must have
        in order to be considered outliers, by default 1
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        if set to true the operation is done in place, the original
        dataframe will be altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps jumps, or None when
        `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    if DIST_TO_PREV not in frame:
        frame.generate_dist_features(
            label_id=label_id, label_dtype=label_dtype)

    logger.debug(
        '\nCleaning gps jumps by distance to jump_coefficient %s...\n'
        % jump_coefficient)

    criteria = {'arg1': jump_coefficient, 'arg2': threshold, 'outliers': True}
    cleaned = _clean_gps(frame, outliers, **criteria)

    if inplace:
        return None
    return cleaned
def _drop_single_point(move_data: DataFrame, label_new_tid: str, label_id: str):
    """
    Drop, in place, the rows whose segment id is -1.

    Parameters
    ----------
    move_data: dataframe
        dataframe with trajectories; modified in place
    label_new_tid : str
        The label of the column containing the ids of the formed segments.
        Rows holding -1 in this column are removed.
    label_id : str
         Indicates the label of the id column in the user dataframe; only
         used to log the number of distinct ids before and after the drop

    """
    shape_before_drop = move_data.shape
    single_point_rows = move_data[move_data[label_new_tid] == -1].index

    if single_point_rows.shape[0] == 0:
        logger.debug('...No trajectories with only one point.')
        return

    logger.debug('...Drop Trajectory with a unique GPS point\n')
    ids_before_drop = len(move_data[label_id].unique())
    move_data.drop(index=single_point_rows, inplace=True)
    ids_after_drop = len(move_data[label_id].unique())

    logger.debug('...Object - before drop: {} - after drop: {}'.format(
        ids_before_drop, ids_after_drop))
    logger.debug('...Shape - before drop: {} - after drop: {}'.format(
        shape_before_drop, move_data.shape))
def discretize_based_grid(self, region_size: int = 1000):
    """
    Discretize the space into grid cells of the same size.

    A `Grid` is built over this object with the given cell size and used to
    create or update the grid index feature on it; afterwards the index is
    reset (the old index is dropped).

    Parameters
    ----------
    region_size: int, optional
        Size of grid cell, by default 1000

    """
    operation = begin_operation('discretize based on grid')
    logger.debug('\nDiscretizing dataframe...')

    Grid(self, cell_size=region_size).create_update_index_grid_feature(self)
    self.reset_index(drop=True, inplace=True)

    self.last_operation = end_operation(operation)
def clean_gps_nearby_points_by_distances(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    radius_area: float = 10.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Remove points that are too close to the point before them.

    The selection of points is delegated to `_filter_single_by_max`, applied
    repeatedly by `_clean_gps` on the DIST_TO_PREV feature until no further
    rows are selected. The distance features are generated first when
    DIST_TO_PREV is missing.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    radius_area : float, optional
        Specifies the minimum distance a point must have to its previous
        point in order not to be dropped, by default 10
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        if set to true the operation is done in place, the original
        dataframe will be altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps nearby points by distance,
        or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    if DIST_TO_PREV not in frame:
        frame.generate_dist_features(
            label_id=label_id, label_dtype=label_dtype)

    logger.debug(
        '\nCleaning gps points from radius of %s meters\n' % radius_area)

    criteria = {'arg1': DIST_TO_PREV, 'arg2': radius_area, 'outliers': False}
    cleaned = _clean_gps(frame, _filter_single_by_max, **criteria)

    if inplace:
        return None
    return cleaned
def clean_gps_nearby_points_by_speed(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    speed_radius: float = 0.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Remove points whose travel speed from the previous point is too low.

    The selection of points is delegated to `_filter_single_by_max`, applied
    repeatedly by `_clean_gps` on the SPEED_TO_PREV feature until no further
    rows are selected. The speed features are generated first when
    SPEED_TO_PREV is missing.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    speed_radius : float, optional
        Specifies the minimum speed a point must have from its previous
        point, in order not to be dropped, by default 0
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        if set to true the operation is done in place, the original
        dataframe will be altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps nearby points by speed,
        or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    if SPEED_TO_PREV not in frame:
        frame.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype)

    logger.debug(
        '\nCleaning gps points using %s speed radius\n' % speed_radius)

    criteria = {'arg1': SPEED_TO_PREV, 'arg2': speed_radius, 'outliers': False}
    cleaned = _clean_gps(frame, _filter_single_by_max, **criteria)

    if inplace:
        return None
    return cleaned
def _create_virtual_grid(self, data: DataFrame, cell_size: float, meters_by_degree: float):
    """
    Create a virtual grid (without polygons) over the bounding box of `data`.

    The cell size is converted to degrees, the upper bounds of the bounding
    box are pushed outwards to the next cell boundary when the cell size
    does not divide the extent exactly, and the grid origin, the number of
    cells on each axis and the cell size in degrees are stored on `self`.

    Parameters
    ----------
    data : DataFrame
        Represents the dataset with contains lat, long and datetime
    cell_size : float
        Size of grid cell
    meters_by_degree : float
        Divisor used to convert `cell_size` into degrees

    """
    operation = begin_operation('_create_virtual_grid')

    bbox = data.get_bbox()
    logger.debug('\nCreating a virtual grid without polygons')

    step = cell_size / meters_by_degree
    logger.debug('...cell size by degree: %s' % step)

    lat_min_y, lon_min_x = bbox[0], bbox[1]
    lat_max_y, lon_max_x = bbox[2], bbox[3]

    def _expand_upper(lower, upper):
        # If cell size does not fit in the extent, an expansion is made up
        # to the next whole cell.
        if math.fmod((upper - lower), step) != 0:
            return lower + step * (math.floor((upper - lower) / step) + 1)
        return upper

    lat_max_y = _expand_upper(lat_min_y, lat_max_y)
    lon_max_x = _expand_upper(lon_min_x, lon_max_x)

    # Number of cells along latitude and longitude.
    grid_size_lat_y = int(round((lat_max_y - lat_min_y) / step))
    grid_size_lon_x = int(round((lon_max_x - lon_min_x) / step))
    logger.debug(
        '...grid_size_lat_y:%s\ngrid_size_lon_x:%s'
        % (grid_size_lat_y, grid_size_lon_x))

    self.lon_min_x = lon_min_x
    self.lat_min_y = lat_min_y
    self.grid_size_lat_y = grid_size_lat_y
    self.grid_size_lon_x = grid_size_lon_x
    self.cell_size_by_degree = step

    logger.debug('\n..A virtual grid was created')
    self.last_operation = end_operation(operation)
def _update_curr_tid_count(
    filter_: ndarray,
    move_data: DataFrame,
    idx: int,
    label_new_tid: str,
    curr_tid: int,
    count: int
) -> tuple[int, int]:
    """
    Assign segment ids to the rows of one id and update the counters.

    A new tid is always started for the given id. When `filter_` is a
    zero-dimensional value the id holds a single row, which receives that
    tid. Otherwise the rows receive the current tid, and every position
    where `filter_` is truthy starts a further tid for that row and all the
    following ones.

    Parameters
    ----------
    filter_ : numpy.ndarray
        Flags, per row of the id, telling where a new segment starts
    move_data : dataframe
        Dataframe to be updated in place
    idx : int
        Index label of the rows being processed
    label_new_tid : str
        label of the new feature
    curr_tid : int
        current tid
    count : int
        number of rows processed so far

    Returns
    -------
    int
        updated current tid
    int
        updated count

    """
    curr_tid += 1

    if filter_.shape == ():
        logger.debug(f'id: {idx} has no point to split')
        move_data.at[idx, label_new_tid] = curr_tid
        return curr_tid, count + 1

    segment_tids = np.full(filter_.shape[0], curr_tid, dtype=np.int64)
    for position, has_problem in enumerate(filter_):
        if not has_problem:
            continue
        # A split point: this row and every later one move to a new tid.
        curr_tid += 1
        segment_tids[position:] = curr_tid

    move_data.at[idx, label_new_tid] = segment_tids
    return curr_tid, count + segment_tids.shape[0]
def bbox_split(bbox: tuple[int, int, int, int], number_grids: int) -> DataFrame:
    """
    Splits the bounding box in N grids of the same size.

    The split is made along the longitude axis only: every grid keeps the
    full latitude range of the bounding box and covers one of
    `number_grids` equal longitude slices.

    Parameters
    ----------
    bbox: tuple
        Tuple of 4 elements, containing the minimum and maximum values
        of latitude and longitude of the bounding box, in the order
        (lat_min, lon_min, lat_max, lon_max).
    number_grids: int
        Determines the number of grids to split the bounding box.

    Returns
    -------
    DataFrame
        Returns the latitude and longitude coordinates of
        the grids after the split, one row per grid, with the columns
        'lat_min', 'lon_min', 'lat_max' and 'lon_max'.

    Raises
    ------
    ZeroDivisionError
        If `number_grids` is 0.

    """
    lat_min, lon_min, lat_max, lon_max = bbox[0], bbox[1], bbox[2], bbox[3]

    # The extent is the distance between the bounds. Subtracting absolute
    # values (as done before) collapses the extent of a box that spans 0,
    # e.g. longitudes -10..10 yielded a step of 0.
    const_lat = abs(lat_max - lat_min) / number_grids
    const_lon = abs(lon_max - lon_min) / number_grids
    logger.debug(f'const_lat: {const_lat}\nconst_lon: {const_lon}')

    # Rows are collected first and the frame is built once:
    # `DataFrame.append` no longer exists in pandas >= 2.0.
    rows = [
        {
            'lat_min': lat_min,
            'lon_min': lon_min + (const_lon * i),
            'lat_max': lat_max,
            'lon_max': lon_min + (const_lon * (i + 1)),
        }
        for i in range(number_grids)
    ]
    return pd.DataFrame(
        rows, columns=['lat_min', 'lon_min', 'lat_max', 'lon_max'])
def create_or_update_gps_deactivated_signal(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    max_time_between_adj_points: float = 7200,
    new_label: str = DEACTIVATED,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Create or update the feature that flags a deactivated gps signal.

    The time features are (re)generated and the flag is computed by
    `_process_simple_filter` from TIME_TO_PREV and
    `max_time_between_adj_points`.
    NOTE(review): the exact comparison lives in `_process_simple_filter`;
    the log message suggests "time greater than the limit" - confirm there.

    Parameters
    ----------
    move_data: dataframe
        The input trajectories data.
    max_time_between_adj_points: float, optional
        The max time between adjacent points, by default 7200
    new_label: string, optional
        The name of the new feature with detected deactivated signals,
        by default DEACTIVATED
    inplace : boolean, optional
        if set to true the original dataframe will be altered, otherwise
        a copy is used, by default False

    Returns
    -------
    DataFrame
        Whatever `_process_simple_filter` returns: presumably the DataFrame
        with the additional features, or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    message = 'Create or update deactivated signal if time max > %s seconds\n'
    logger.debug(message % max_time_between_adj_points)

    frame.generate_time_features()

    return _process_simple_filter(
        frame,
        new_label,
        TIME_TO_PREV,
        max_time_between_adj_points,
        inplace,
    )
def create_or_update_gps_jump(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    max_dist_between_adj_points: float = 3000,
    new_label: str = JUMP,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Create or update the feature that flags gps jumps.

    The distance features are (re)generated and the flag is computed by
    `_process_simple_filter` from DIST_TO_PREV and
    `max_dist_between_adj_points`.
    NOTE(review): the exact comparison lives in `_process_simple_filter`;
    the log message suggests "distance greater than the limit" - confirm
    there.

    Parameters
    ----------
    move_data: dataframe
        The input trajectories data.
    max_dist_between_adj_points: float, optional
        The maximum distance between adjacent points, by default 3000
    new_label: string, optional
        The name of the new feature with detected gps jumps,
        by default JUMP
    inplace : boolean, optional
        if set to true the original dataframe will be altered, otherwise
        a copy is used, by default False

    Returns
    -------
    DataFrame
        Whatever `_process_simple_filter` returns: presumably the DataFrame
        with the additional features, or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    message = 'Create or update jump if dist max > %s meters\n'
    logger.debug(message % max_dist_between_adj_points)

    frame.generate_dist_features()

    return _process_simple_filter(
        frame,
        new_label,
        DIST_TO_PREV,
        max_dist_between_adj_points,
        inplace,
    )
def clean_id_by_time_max(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    time_max: float = 3600,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Drop every id whose accumulated time is below a user-defined limit.

    TIME_TO_PREV is summed per id; all rows of the ids whose total is
    smaller than `time_max` are removed. The distance/time/speed features
    are generated first when TIME_TO_PREV is missing.

    Parameters
    ----------
    move_data: dataframe.
        The input data.
    label_id: str, optional
        The label of the column which contains the id of the trajectories,
        by default TRAJ_ID
    time_max: float, optional
        Total time a set of points with the same id must reach in order
        not to be dropped, by default 3600
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        if set to true the operation is done in place, the original
        dataframe will be altered and None is returned, by default False

    Returns
    -------
    dataframe or None
        The filtered trajectories, or None when `inplace` is True

    """
    frame = move_data if inplace else move_data.copy()

    if TIME_TO_PREV not in frame:
        frame.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype)

    logger.debug(
        '\nClean gps points with time max by id < %s seconds' % time_max)

    time_by_id = frame.groupby([label_id], as_index=False).agg(
        {TIME_TO_PREV: 'sum'})
    ids_to_drop = time_by_id.query(f'{TIME_TO_PREV} < {time_max}')

    logger.debug(
        '...Ids total: %s\nIds to drop:%s'
        % (frame[label_id].nunique(), ids_to_drop[label_id].nunique()))

    if ids_to_drop.shape[0] > 0:
        rows_before = frame.shape[0]
        belongs_to_dropped_id = frame[label_id].isin(ids_to_drop[label_id])
        frame.drop(frame[belongs_to_dropped_id].index, inplace=True)
        logger.debug(
            '...Rows before drop: %s\n Rows after drop: %s'
            % (rows_before, frame.shape[0]))

    if inplace:
        return None
    return frame
def create_update_index_grid_feature(
    self,
    data: DataFrame,
    unique_index: bool = True,
    label_dtype: Callable = np.int64,
    sort: bool = True
):
    """
    Create or update the grid index feature(s) of `data`, in place.

    The grid indexes of every point are computed with
    `point_to_index_grid` and converted with `label_dtype`. With
    `unique_index` a single INDEX_GRID column is written, combining both
    indexes as `lon * grid_size_lat_y + lat`; otherwise the two indexes are
    written to INDEX_GRID_LAT and INDEX_GRID_LON.

    Parameters
    ----------
    data : DataFrame
        Represents the dataset with contains lat, long and datetime.
    unique_index: bool, optional
        Whether to write one combined index instead of two columns,
        by default True
    label_dtype : Callable, optional
        Represents the type of a value of new column in dataframe,
        by default np.int64
    sort : bool, optional
        Whether to sort `data` by TRAJ_ID and DATETIME first,
        by default True

    """
    operation = begin_operation('create_update_index_grid_feature')
    logger.debug('\nCreating or updating index of the grid feature..\n')

    if sort:
        data.sort_values([TRAJ_ID, DATETIME], inplace=True)

    raw_lat, raw_lon = self.point_to_index_grid(
        data[LATITUDE], data[LONGITUDE])
    lat_idx = label_dtype(raw_lat)
    lon_idx = label_dtype(raw_lon)

    grid = self.get_grid()
    if not unique_index:
        data[INDEX_GRID_LAT] = lat_idx
        data[INDEX_GRID_LON] = lon_idx
    else:
        data[INDEX_GRID] = lon_idx * grid['grid_size_lat_y'] + lat_idx

    self.last_operation = end_operation(operation)
def _end_create_operation(
    move_data: DataFrame, new_label: str, inplace: bool
) -> DataFrame | None:
    """
    Log the value counts of the created feature and pick the return value.

    Parameters
    ----------
    move_data: dataframe
        The input trajectories data.
    new_label: string
        The name of the feature whose value counts are logged.
    inplace : boolean
        if set to true nothing is returned, otherwise `move_data` is
        returned.

    Returns
    -------
    DataFrame
        `move_data` itself, or None when `inplace` is True

    """
    feature_counts = move_data[new_label].value_counts()
    logger.debug(feature_counts)
    return None if inplace else move_data
def create_or_update_out_of_the_bbox(
    move_data: DataFrame,
    bbox: tuple[int, int, int, int],
    new_label: str = OUT_BBOX,
    inplace: bool = False
) -> DataFrame | None:
    """
    Create or update a boolean feature to detect points out of the bbox.

    The points are selected with `filters.by_bbox(..., filter_out=True)`;
    the rows whose index labels appear in that selection get `new_label`
    set to True, every other row gets False.

    Parameters
    ----------
    move_data: dataframe
        The input trajectories data.
    bbox : tuple
        Tuple of 4 elements, containing the minimum and maximum values
        of latitude and longitude of the bounding box.
    new_label: string, optional
        The name of the new feature with detected points out of the bbox,
        by default OUT_BBOX
    inplace : boolean, optional
        if set to true the original dataframe will be altered and None is
        returned, otherwise a copy will be returned, by default False

    Returns
    -------
    DataFrame
        Returns dataframe with a boolean feature with detected
        points out of the bbox, or None

    Raises
    ------
    ValueError
        If feature generation fails

    """
    if not inplace:
        move_data = move_data.copy()

    logger.debug(
        '\nCreate or update boolean feature to detect points out of the bbox')
    filtered_ = filters.by_bbox(move_data, bbox, filter_out=True)

    if filtered_ is None:
        raise ValueError('Filter bbox failed!')

    logger.debug('...Creating a new label named as %s' % new_label)
    move_data[new_label] = False

    if filtered_.shape[0] > 0:
        # '%s' is required here: the former '% as' was parsed as a '% a'
        # conversion and mangled the message.
        logger.debug('...Setting %s as True\n' % new_label)
        # `.at` only addresses a single cell; setting many rows by label
        # is the job of `.loc`.
        move_data.loc[filtered_.index, new_label] = True

    return _end_create_operation(move_data, new_label, inplace)
def _clean_gps(move_data: DataFrame, f: Callable, **kwargs):
    """
    Repeatedly drop the rows selected by a filtering function.

    `_filter_data` is asked for the rows to drop; while it reports a
    positive number of rows, those rows are removed in place and the filter
    is evaluated again on what is left. A named index is reset beforehand.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be filtered; modified in place.
    f : function
        Filtering function
    **kwargs : arguments
        Forwarded to `_filter_data` as a dict
        - arg1 : feature
        - arg2 : value
        - outliers : special behavior if cleaning by outliers

    Returns
    -------
    dataframe
        Filtered dataframe.

    """
    if move_data.index.name is not None:
        logger.debug('...Reset index for filtering\n')
        move_data.reset_index(inplace=True)

    total_dropped = 0
    while True:
        selected_points, n_selected = _filter_data(move_data, f, kwargs)
        if not n_selected > 0:
            break

        logger.debug('...Dropping %s rows of gps points\n' % n_selected)
        rows_before = move_data.shape[0]
        move_data.drop(index=selected_points.index, inplace=True)
        total_dropped = total_dropped + n_selected
        logger.debug(
            '...Rows before: %s, Rows after:%s, Sum drop:%s\n'
            % (rows_before, move_data.shape[0], total_dropped))

    logger.debug('%s GPS points were dropped' % total_dropped)
    return move_data
def create_or_update_move_stop_by_dist_time(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    dist_radius: float = 30,
    time_radius: float = 900,
    label_id: str = TRAJ_ID,
    new_label: str = SEGMENT_STOP,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Determines the stops and moves points of the dataframe.

    If these points already exist, they will be updated.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    dist_radius : float, optional
        The first step in this function is segmenting the trajectory
        The segments are used to find the stop points
        The dist_radius defines the distance used in the segmentation,
        by default 30
    time_radius : float, optional
        The time_radius used to determine if a segment is a stop
        If the summed time of the segment is greater than time_radius,
        then the segment is a stop, by default 900
    label_id : str, optional
         Indicates the label of the id column in the user dataframe,
         by default TRAJ_ID
    new_label : str, optional
        Is the name of the column that receives the segment ids,
        by default SEGMENT_STOP
    inplace : bool, optional
        if set to true the original dataframe will be altered and None is
        returned, otherwise a copy will be returned, by default False

    Returns
    -------
    DataFrame
        DataFrame with 2 additional features: `new_label` and STOP.
        `new_label` indicates the trajectory segment to which the point
        belongs, STOP indicates if the point represents a stop.
        None is returned when `inplace` is True.

    """
    if not inplace:
        move_data = move_data.copy()

    by_max_dist(
        move_data,
        label_id=label_id,
        max_dist_between_adj_points=dist_radius,
        label_new_tid=new_label,
        inplace=True
    )

    move_data.generate_dist_time_speed_features(
        label_id=new_label
    )

    logger.debug('Create or update stop as True or False')
    logger.debug(
        '...Creating stop features as True or False using %s to time in seconds'
        % time_radius
    )
    move_data[STOP] = False

    # Ids of the segments whose accumulated time exceeds the time radius.
    stop_segments = (
        move_data.groupby(by=new_label)
        .agg({TIME_TO_PREV: 'sum'})
        .query(f'{TIME_TO_PREV} > {time_radius}')
        .index
    )

    # A boolean mask with `.loc` marks exactly the rows of those segments;
    # `.at` only addresses a single cell and cannot take a set of rows.
    is_stop = move_data[new_label].isin(stop_segments)
    move_data.loc[is_stop, STOP] = True

    logger.debug(move_data[STOP].value_counts())

    if not inplace:
        return move_data
def create_or_update_move_and_stop_by_radius(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    radius: float = 0,
    target_label: str = DIST_TO_PREV,
    new_label: str = SITUATION,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Finds the stops and moves points of the dataframe.

    If these points already exist, they will be updated.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    radius : float, optional
        The radius value is used to determine if a point is a stop.
        If the value of the point in target_label is greater than radius,
        the point is a move, otherwise it's a stop, by default 0
    target_label : str, optional
        The feature compared against radius, by default DIST_TO_PREV
    new_label : str, optional
        Is the name of the column that indicates if a point is a stop
        or a move, by default SITUATION
    inplace : bool, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        dataframe with the additional feature `new_label`, or None when
        `inplace` is True. `new_label` holds MOVE or STOP for each point;
        points whose `target_label` value is missing are left as NaN.

    """
    logger.debug('\nCreating or updating features MOVE and STOPS...\n')

    if not inplace:
        move_data = move_data.copy()

    if DIST_TO_PREV not in move_data:
        move_data.generate_dist_features()

    target = move_data[target_label]

    # Build the labels in an object array: mixing the string choices with a
    # float NaN default in np.select has no common dtype on recent NumPy
    # (TypeError) and silently became the string 'nan' on older releases.
    situation = np.full(move_data.shape[0], np.nan, dtype=object)
    situation[np.asarray(target > radius)] = MOVE
    situation[np.asarray(target <= radius)] = STOP
    move_data[new_label] = situation

    logger.debug(
        '\n....There are %s stops to this parameters\n',
        int((move_data[new_label] == STOP).sum())
    )

    if not inplace:
        return move_data
def by_dist_time_speed(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    max_dist_between_adj_points: float = 3000,
    max_time_between_adj_points: float = 900,
    max_speed_between_adj_points: float = 50.0,
    drop_single_points: bool = True,
    label_new_tid: str = TID_PART,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Segments trajectories using distance, time and speed limits together.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Label of the id column in the user dataframe, by default TRAJ_ID
    max_dist_between_adj_points : float, optional
        Maximum distance allowed between two adjacent points,
        by default 3000
    max_time_between_adj_points : float, optional
        Maximum travel time allowed between two adjacent points,
        by default 900
    max_speed_between_adj_points : float, optional
        Maximum speed allowed between two adjacent points, by default 50
    drop_single_points : bool, optional
        If True, segments made of a single point are dropped,
        by default True
    label_new_tid : str, optional
        Label of the column that receives the ids of the new segments,
        by default TID_PART
    inplace : bool, optional
        If True the given dataframe is modified and nothing is returned,
        otherwise the work is done on a copy that is returned,
        by default False

    Returns
    -------
    DataFrame
        DataFrame with the additional feature `label_new_tid`, which
        indicates the trajectory segment each point belongs to, or None
        when `inplace` is True.

    Note
    ----
    Time, distance and speed features must be updated after split.

    """
    target = move_data if inplace else move_data.copy()

    logger.debug('\nSplit trajectories')
    limit_messages = (
        ('...max_dist_between_adj_points: {}', max_dist_between_adj_points),
        ('...max_time_between_adj_points: {}', max_time_between_adj_points),
        ('...max_speed_between_adj_points: {}', max_speed_between_adj_points),
    )
    for template, limit in limit_messages:
        logger.debug(template.format(limit))

    # The segmentation filters read the time/dist/speed features.
    if TIME_TO_PREV not in target:
        target.generate_dist_time_speed_features()

    target = _filter_by(
        target,
        label_id,
        label_new_tid,
        drop_single_points,
        all=True,
        max_dist=max_dist_between_adj_points,
        max_time=max_time_between_adj_points,
        max_speed=max_speed_between_adj_points,
    )

    return None if inplace else target
def _filter_by(
    move_data: DataFrame,
    label_id: str,
    label_new_tid: str,
    drop_single_points: bool,
    **kwargs
) -> DataFrame:
    """
    Splits the trajectories into segments.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data; it is modified in place
    label_id : str
        Indicates the label of the id column in the user dataframe
    label_new_tid : str
        The label of the column containing the ids of the formed segments.
        Is the new splitted id.
    drop_single_points : bool
        If set to True, drops the trajectories with only one point.
    **kwargs : arguments depends on the type of segmentation
        - all : if is a segmentation by all features; treated as False
          when omitted
        - max_dist : maximum dist between adjacent points (used with all)
        - max_time : maximum time between adjacent points (used with all)
        - max_speed : maximum speed between adjacent points (used with all)
        - feature : feature to use for segmentation (used without all)
        - max_between_adj_points : maximum value for feature
          (used without all)

    Returns
    -------
    dataframe
        DataFrame with the additional feature `label_new_tid`, that
        indicates the trajectory segment to which the point belongs to.

    Raises
    ------
    KeyError
        If a keyword required by the selected segmentation mode is missing.

    Note
    ----
    Time, distance and speed features must be updated after split.

    """
    curr_tid, ids, count = _prepare_segmentation(
        move_data, label_id, label_new_tid
    )

    # Read the mode once. Single-feature callers may leave `all` out, so it
    # defaults to False instead of raising KeyError.
    use_all_features = kwargs.get('all', False)

    for idx in progress_bar(ids, desc='Generating %s' % label_new_tid):
        if use_all_features:
            filter_ = _filter_and_dist_time_speed(
                move_data,
                idx,
                kwargs['max_dist'],
                kwargs['max_time'],
                kwargs['max_speed'],
            )
        else:
            filter_ = _filter_or_dist_time_speed(
                move_data,
                idx,
                kwargs['feature'],
                kwargs['max_between_adj_points'],
            )

        curr_tid, count = _update_curr_tid_count(
            filter_, move_data, idx, label_new_tid, curr_tid, count
        )

    if label_id == label_new_tid:
        # The index carries the same name as the new column, so it is
        # discarded instead of being restored as a column.
        move_data.reset_index(drop=True, inplace=True)
        logger.debug(
            '... label_tid = label_new_id, then reseting and drop index'
        )
    else:
        move_data.reset_index(inplace=True)
        logger.debug('... Reseting index\n')

    if drop_single_points:
        _drop_single_point(move_data, label_new_tid, label_id)
        move_data.generate_dist_time_speed_features()

    return move_data
def compress_segment_stop_to_point(
    move_data: DataFrame,
    label_segment: str = SEGMENT_STOP,
    label_stop: str = STOP,
    point_mean: str = 'default',
    drop_moves: bool = False,
    label_id: str = TRAJ_ID,
    dist_radius: float = 30,
    time_radius: float = 900,
    inplace: bool = False,
) -> DataFrame | None:
    """
    Compress the trajectories using the stop points in the dataframe.

    Each stop segment is reduced to its first and last points, which
    receive the representative coordinates of the segment in the LAT_MEAN
    and LON_MEAN columns. The remaining points of the stop segments are
    dropped.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_segment : str, optional
        The label of the column containing the ids of the formed segments.
        Is the new splitted id, by default SEGMENT_STOP
    label_stop : str, optional
        Is the name of the column that indicates if a point is a stop,
        by default STOP
    point_mean : str, optional
        'default' uses the (lat, lon) pair that repeats the most inside
        the segment, 'centroid' uses the mean of the segment coordinates.
        With any other value the stop segments get no representative
        point and all their points are dropped, by default 'default'
    drop_moves : bool, optional
        If set to true, the moving points will be dropped from the
        dataframe, by default False
    label_id : str, optional
        Used to create the stay points used in the compression.
        If the dataset already has the stop move, this parameter is
        ignored. Indicates the label of the id column in the user
        dataframe, by default TRAJ_ID
    dist_radius : float, optional
        Used to create the stay points used in the compression.
        If the dataset already has the stop move, this parameter is
        ignored. Defines the distance used in the segmentation,
        by default 30
    time_radius : float, optional
        Used to create the stay points used in the compression.
        If the dataset already has the stop move, this parameter is
        ignored. A segment whose accumulated time is greater than
        time_radius is a stop, by default 900
    inplace : bool, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        Data with the additional features LAT_MEAN and LON_MEAN (plus the
        segment and stop columns when they had to be created), or None
        when `inplace` is True. Moving points that are kept have NaN in
        LAT_MEAN and LON_MEAN.

    """
    if not inplace:
        move_data = move_data.copy()

    # Both columns are read below, so (re)create them when either one is
    # missing, and segment into the column this function actually reads.
    if label_segment not in move_data or label_stop not in move_data:
        create_or_update_move_stop_by_dist_time(
            move_data,
            dist_radius,
            time_radius,
            label_id,
            new_label=label_segment,
            inplace=True,
        )

    logger.debug('...setting mean to lat and lon...')
    n_points = move_data.shape[0]
    lat_mean = np.full(n_points, np.nan, dtype=np.float64)
    lon_mean = np.full(n_points, np.nan, dtype=np.float64)

    # Positional mask of rows to keep. An explicit mask replaces a -1.0
    # coordinate sentinel, which could collide with real coordinates.
    keep = np.zeros(n_points, dtype=bool)

    is_stop = move_data[label_stop].to_numpy(dtype=bool)
    if not drop_moves:
        keep[~is_stop] = True
    else:
        logger.debug('...move segments will be dropped...')

    logger.debug('...get only segments stop...')
    segment_ids = move_data[label_segment].to_numpy()
    segments = move_data.loc[is_stop, label_segment].unique()

    for idx in progress_bar(
        segments, desc=f'Generating {label_segment} and {label_stop}'
    ):
        # Work with row positions, not index labels, so the NumPy buffers
        # are addressed correctly for any dataframe index.
        positions = np.flatnonzero(segment_ids == idx)

        if positions.size > 1:
            segment = move_data.iloc[positions]

            if point_mean == 'default':
                # Most repeated coordinate pair inside the segment.
                pair_counts = segment.groupby([LATITUDE, LONGITUDE]).size()
                lat, lon = pair_counts.sort_values().index[-1]
            elif point_mean == 'centroid':
                lat = segment[LATITUDE].mean()
                lon = segment[LONGITUDE].mean()
            else:
                # Unknown option: no representative point is produced, so
                # the whole segment stays marked for removal.
                continue

            # Only the first and last points of the stop segment survive.
            endpoints = positions[[0, -1]]
            lat_mean[endpoints] = lat
            lon_mean[endpoints] = lon
            keep[endpoints] = True
        else:
            logger.debug(f'There are segments with only one point: {idx}')

    move_data[LAT_MEAN] = lat_mean
    move_data[LON_MEAN] = lon_mean

    shape_before = n_points
    shape_drop = int(np.count_nonzero(~keep))

    if shape_drop > 0:
        logger.debug('...Dropping %s points...', shape_drop)
        # Dropping is label based: assumes the index labels are unique.
        move_data.drop(move_data.index[~keep], inplace=True)

    logger.debug(
        '...Shape_before: %s\n...Current shape: %s',
        shape_before,
        move_data.shape[0],
    )

    if not inplace:
        return move_data
def knn_query(
    traj: DataFrame,
    move_df: DataFrame,
    k: int = 5,
    id_: str = TRAJ_ID,
    distance: str = MEDP,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME
) -> DataFrame:
    """
    Returns the k neighboring trajectories closest to the trajectory.

    Given a k, a trajectory and a DataFrame with multiple paths.

    Parameters
    ----------
    traj: dataframe
        The input of one trajectory.
    move_df: dataframe
        The input trajectory data.
    k: int, optional
        neighboring trajectories, by default 5
    id_: str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    distance: string, optional
        Distance measure type, by default MEDP
    latitude: string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude: string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime: string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME

    Returns
    -------
    DataFrame
        `traj` followed by the rows of the nearest trajectories, ordered
        from the closest to the farthest. Fewer than k trajectories are
        appended when `move_df` does not hold k other trajectories with a
        finite distance.

    Raises
    ------
    ValueError: if distance measure is invalid

    Examples
    --------
    >>> from pymove.query.query import knn_query
    >>> traj_df
       lat    lon              datetime  id
    0  16.4  -54.9  2014-10-11 18:00:00   1
    1  16.4  -55.9  2014-10-12 00:00:00   1
    2  16.4  -56.9  2014-10-12 06:00:00   1
    >>> move_df
       lat    lon              datetime  id
    0  33.1  -77.0  2012-05-19 00:00:00   2
    1  32.8  -77.1  2012-05-19 06:00:00   3
    2  32.5  -77.3  2012-05-19 12:00:00   4
    >>> knn_query(
    >>>    traj_df, move_df, k=1
    >>> )
       lat    lon              datetime  id
    0  16.4  -54.9  2014-10-11 18:00:00   1
    1  16.4  -55.9  2014-10-12 00:00:00   1
    2  16.4  -56.9  2014-10-12 06:00:00   1
    2  32.5  -77.3  2012-05-19 12:00:00   4
    """
    if distance == MEDP:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medp(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medt(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')

    # The query trajectory itself is never a neighbour candidate.
    query_id = traj[id_].values[0]

    candidates = []
    for traj_id in progress_bar(
        move_df[id_].unique(), desc=f'Querying knn by {distance}'
    ):
        if traj_id == query_id:
            continue

        this = move_df.loc[move_df[id_] == traj_id]
        this_distance = dist_measure(
            traj, this, latitude, longitude, datetime
        )

        # Infinite or NaN distances can never qualify as a neighbour.
        if this_distance < np.inf:
            candidates.append((this_distance, traj_id))

    # Rank every candidate and keep the k best. Overwriting the first worse
    # slot of a fixed table loses previously found neighbours.
    candidates.sort(key=lambda candidate: candidate[0])
    nearest = candidates[:max(k, 0)]

    logger.debug('Generating DataFrame with k nearest trajectories.')
    frames = [traj]
    frames.extend(
        move_df.loc[move_df[id_] == traj_id] for _, traj_id in nearest
    )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(frames)