def create_all_polygons_on_grid(self): """ Create all polygons that are represented in a grid and store them in a new dic_grid key . """ operation = begin_operation('create_all_polygons_on_grid') try: print('\nCreating all polygons on virtual grid', flush=True) grid_polygon = np.array( [[None for i in range(self.grid_size_lon_x)] for j in range(self.grid_size_lat_y)]) lat_init = self.lat_min_y cell_size = self.cell_size_by_degree for i in progress_bar(range(self.grid_size_lat_y)): lon_init = self.lon_min_x for j in range(self.grid_size_lon_x): # Cria o polygon da célula grid_polygon[i][j] = Polygon(( (lat_init, lon_init), (lat_init + cell_size, lon_init), (lat_init + cell_size, lon_init + cell_size), (lat_init, lon_init + cell_size), )) lon_init += cell_size lat_init += cell_size self.grid_polygon = grid_polygon print('...geometries saved on Grid grid_polygon property') self.last_operation = end_operation(operation) except Exception as e: self.last_operation = end_operation(operation) raise e
def insert_points_in_df(data: DataFrame, aug_df: DataFrame):
    """
    Inserts the points of the generated trajectories into the original dataset.

    Parameters
    ----------
    data : DataFrame
        The input trajectories data
    aug_df : DataFrame
        The data of unobserved trajectories
    """
    for _, row in progress_bar(aug_df.iterrows(), total=aug_df.shape[0]):
        keys = row.index.tolist()
        values = row.values.tolist()
        row_df = pd.DataFrame()
        # First pass: sequence-valued columns define the number of new rows
        for k, v in zip(keys, values):
            if k in data and isinstance(v, (list, np.ndarray)):
                row_df[k] = v
        # Second pass: scalar values are broadcast over the new rows
        for k, v in zip(keys, values):
            if k in data and not isinstance(v, (list, np.ndarray)):
                row_df[k] = v
        for _, row_ in row_df.iterrows():
            append_row(data, row=row_)
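# Usage sketch (hypothetical frames; assumes pymove's append_row helper is in
# scope, as in the function above): each list-valued column of aug_df expands
# into one appended row per element.
import pandas as pd

data = pd.DataFrame({'id': [1], 'local': [85]})
aug_df = pd.DataFrame({'id': [[2, 2]], 'local': [[673, 394]]})
insert_points_in_df(data, aug_df)
print(data)   # original row plus the two unobserved points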
def decode_geohash_to_latlon(data: DataFrame,
                             label_geohash: Optional[Text] = GEOHASH,
                             reset_index: Optional[bool] = True):
    """
    Decode feature with hash of trajectories back to geographic coordinates.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    label_geohash : str, optional
        The name of the feature with hashed trajectories, by default GEOHASH
    reset_index : boolean, optional
        Condition to reset the df index, by default True
    """
    if label_geohash not in data:
        raise ValueError('feature {} not in df'.format(label_geohash))
    lat, lon, _, _ = _reset_and_create_arrays_none(data, reset_index=reset_index)
    for idx, row in progress_bar(data[[label_geohash]].iterrows(),
                                 total=data.shape[0]):
        lat_lon = _decode(row[label_geohash])
        lat[idx] = lat_lon[0]
        lon[idx] = lat_lon[1]
    data[LATITUDE_DECODE] = lat
    data[LONGITUDE_DECODE] = lon
    print('\n================================================')
    print('\n==> lat and lon decode features were created. <==')
    print('\n================================================')
def join_collective_areas(gdf_: DataFrame, gdf_rules_: DataFrame,
                          label_geometry: Optional[Text] = GEOMETRY):
    """
    Performs the integration between trajectories and collective areas,
    generating a new column that indicates whether each point of the
    trajectory lies inside a collective area.

    Parameters
    ----------
    gdf_ : geopandas.GeoDataFrame
        The input trajectory data
    gdf_rules_ : geopandas.GeoDataFrame
        The input collective areas data
    label_geometry : str, optional
        Label of gdf_rules_ referring to the geometry of each feature,
        by default GEOMETRY
    """
    print('Integration between trajectories and collective areas')
    polygons = gdf_rules_[label_geometry].unique()
    gdf_[VIOLATING] = False
    for p in progress_bar(polygons):
        intersects = gdf_[label_geometry].intersects(p)
        index = gdf_[intersects].index
        gdf_.at[index, VIOLATING] = True
def create_all_polygons_on_grid(self): """ Create all polygons that are represented in a grid. Stores the polygons in the `grid_polygon` key """ operation = begin_operation('create_all_polygons_on_grid') logger.debug('\nCreating all polygons on virtual grid') grid_polygon = np.array([[None for _ in range(self.grid_size_lon_x)] for _ in range(self.grid_size_lat_y)]) lat_init = self.lat_min_y cell_size = self.cell_size_by_degree for i in progress_bar(range(self.grid_size_lat_y), desc='Creating polygons'): lon_init = self.lon_min_x for j in range(self.grid_size_lon_x): # Cria o polygon da célula grid_polygon[i][j] = Polygon( ((lon_init, lat_init), (lon_init, lat_init + cell_size), (lon_init + cell_size, lat_init + cell_size), (lon_init + cell_size, lat_init))) lon_init += cell_size lat_init += cell_size self.grid_polygon = grid_polygon logger.debug('...geometries saved on Grid grid_polygon property') self.last_operation = end_operation(operation)
def join_with_pois(data: DataFrame, df_pois: DataFrame,
                   label_id: Optional[Text] = TRAJ_ID,
                   label_poi_name: Optional[Text] = NAME_POI,
                   reset_index: Optional[bool] = True):
    """
    Performs the integration between trajectories and points of interest,
    generating two new columns referring to the name and the distance from
    the point of interest closest to each point of the trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_id : str, optional
        Label of df_pois referring to the Point of Interest id,
        by default TRAJ_ID
    label_poi_name : str, optional
        Label of df_pois referring to the Point of Interest name,
        by default NAME_POI
    reset_index : bool, optional
        Flag to reset the index of the df_pois and data dataframes before
        the join, by default True
    """
    print('Integration with POIs...')
    values = _reset_and_creates_id_and_lat_lon(data, df_pois, True, reset_index)
    current_distances, ids_POIs, tag_POIs, lat_user, lon_user = values
    for idx, row in progress_bar(data.iterrows(), total=len(data)):
        # fill the helper vectors with the current point's coordinates
        lat_user.fill(row[LATITUDE])
        lon_user.fill(row[LONGITUDE])
        # compute haversine distances from this point to every POI
        distances = np.float64(
            haversine(
                lat_user,
                lon_user,
                df_pois[LATITUDE].values,
                df_pois[LONGITUDE].values,
            ))
        # keep the closest POI and its distance
        index_min = np.argmin(distances)
        current_distances[idx] = np.min(distances)
        ids_POIs[idx] = df_pois.at[index_min, label_id]
        tag_POIs[idx] = df_pois.at[index_min, label_poi_name]
    data[ID_POI] = ids_POIs
    data[DIST_POI] = current_distances
    data[NAME_POI] = tag_POIs
    print('Integration with POIs was finalized')
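# Usage sketch (hypothetical POI data; assumes 'lat'/'lon' match pymove's
# LATITUDE/LONGITUDE constants and that ID_POI/DIST_POI resolve to
# 'id_poi'/'dist_poi', as in the pymove versions I have seen).
import pandas as pd

pois = pd.DataFrame({'id': [1, 2], 'name_poi': ['bank', 'park'],
                     'lat': [39.9841, 39.9850], 'lon': [116.3192, 116.3200]})
join_with_pois(move_df, pois, label_id='id', label_poi_name='name_poi')
print(move_df[['id_poi', 'dist_poi', 'name_poi']].head())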
def join_collective_areas(gdf_, gdf_rules_, label_geometry=GEOMETRY):
    """
    Performs the integration between trajectories and collective areas,
    generating a new column that indicates whether each point of the
    trajectory lies inside a collective area.

    Parameters
    ----------
    gdf_ : geopandas.GeoDataFrame
        The input trajectory data
    gdf_rules_ : geopandas.GeoDataFrame
        The input collective areas data
    label_geometry : String, optional ("geometry" by default)
        Label of gdf_rules_ referring to the geometry of each feature
    """
    print('Integration between trajectories and collective areas')
    polygons = gdf_rules_[label_geometry].unique()
    gdf_[VIOLATING] = False
    for p in progress_bar(polygons):
        index = gdf_[gdf_[label_geometry].intersects(p)].index
        gdf_.at[index, VIOLATING] = True
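# Usage sketch (assumes geopandas is available; 'violating' is the column
# name behind pymove's VIOLATING constant in the versions I have seen).
import geopandas as gpd
from shapely.geometry import Point, Polygon

area = Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])
rules = gpd.GeoDataFrame({'geometry': [area]})
points = gpd.GeoDataFrame({'geometry': [Point(0.5, 0.5), Point(2.0, 2.0)]})
join_collective_areas(points, rules)
print(points['violating'])   # True inside the area, False outside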
def create_bin_geohash_df(df_, precision=15):
    """
    Create trajectory geohash binaries and integrate with df.

    Parameters
    ----------
    df_ : dataframe
        The input trajectories data.
    precision : number, optional, default 15
        Number of characters in resulting geohash.
    """
    _, _, _, bin_geohash = _reset_and_create_arrays_none(df_)
    for idx, row in progress_bar(df_[[LATITUDE, LONGITUDE]].iterrows(),
                                 total=df_.shape[0]):
        bin_geohash[idx] = _bin_geohash(row[LATITUDE], row[LONGITUDE], precision)
    df_[BIN_GEOHASH] = bin_geohash
    print('\n================================================')
    print('\n=====> bin_geohash feature was created. <=======')
    print('\n================================================')
def decode_geohash_to_latlon(df_, label_geohash=GEOHASH, reset_index=True):
    """
    Decode feature with hash of trajectories back to geographic coordinates.

    Parameters
    ----------
    df_ : dataframe
        The input trajectories data.
    label_geohash : str, optional, default 'geohash'
        The name of the feature with hashed trajectories
    reset_index : boolean, optional, default True
        Condition to reset the df index.
    """
    if label_geohash not in df_:
        raise ValueError('feature {} not in df'.format(label_geohash))
    lat, lon, _, _ = _reset_and_create_arrays_none(df_, reset_index=reset_index)
    for idx, row in progress_bar(df_[[label_geohash]].iterrows(),
                                 total=df_.shape[0]):
        lat_lon = _decode(row[label_geohash])
        lat[idx] = lat_lon[0]
        lon[idx] = lat_lon[1]
    df_[LATITUDE_DECODE] = lat
    df_[LONGITUDE_DECODE] = lon
    print('\n================================================')
    print('\n==> lat and lon decode features were created. <==')
    print('\n================================================')
def column_to_array(df_, label_conversion):
    """
    Transforms the values of a column into lists.

    Parameters
    ----------
    df_ : dataframe
        The input trajectory data.
    label_conversion : str
        Label of df_ referring to the column for conversion.
    """
    if label_conversion not in df_:
        raise KeyError('Dataframe must contain a %s column' % label_conversion)
    arr = np.full(df_.shape[0], None, dtype=np.ndarray)
    for idx, row in progress_bar(df_.iterrows(), total=df_.shape[0]):
        arr[idx] = object_for_array(row[label_conversion])
    df_[label_conversion] = arr
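# Usage sketch (assumption: object_for_array parses a string-encoded list
# back into an array, as pymove's conversion helpers do).
import pandas as pd

df = pd.DataFrame({'local': ['[85, 673]', '[263, 224]']})
column_to_array(df, 'local')
print(df['local'].iloc[0])   # array-like instead of a raw string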
def gap_statistic(move_data, nrefs=3, k_initial=1, max_clusters=15,
                  k_iteration=1):
    """
    Calculates the optimal number of clusters using the Gap Statistic of
    Tibshirani, Walther and Hastie.

    Parameters
    ----------
    move_data : ndarray of shape (n_samples, n_features).
        The input trajectory data.
    nrefs : int, optional (3 by default).
        Number of sample reference datasets to create.
    k_initial : int, optional (1 by default).
        The initial value used in the iteration of the method.
        Represents the minimum number of clusters.
    max_clusters : int, optional (15 by default).
        Maximum number of clusters to test for.
    k_iteration : int, optional (1 by default).
        Increment value of the sequence of cluster numbers.

    Returns
    -------
    dict
        The gap value for each cluster number

    Notes
    -----
    https://anaconda.org/milesgranger/gap-statistic/notebook
    """
    message = 'Executing Gap Statistic for k from %s to %s, step %s\n'
    message = message % (k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    gaps = {}
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration)):
        # Holder for reference dispersion results
        ref_disps = np.zeros(nrefs)
        # For n references, generate a random sample, run k-means and
        # record the resulting dispersion of each run
        for i in range(nrefs):
            # Create new random reference set
            random_reference = np.random.random_sample(size=move_data.shape)
            # Fit to it
            km = KMeans(k)
            km.fit(random_reference)
            ref_disps[i] = km.inertia_
        # Fit cluster to original data and compute its dispersion
        km = KMeans(k).fit(move_data)
        orig_disp = km.inertia_
        # Calculate gap statistic
        gap = np.log(np.mean(ref_disps)) - np.log(orig_disp)
        # Assign this run's gap statistic to gaps
        gaps[k] = gap
    return gaps
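# Usage sketch: a common reading of the statistic is to pick the k with the
# largest gap over the tested range (toy random data for illustration).
import numpy as np

points = np.random.rand(200, 2)        # stand-in for (lat, lon) pairs
gaps = gap_statistic(points, nrefs=3, k_initial=1, max_clusters=8)
best_k = max(gaps, key=gaps.get)       # k whose gap is largest
print('suggested number of clusters:', best_k)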
def generate_trajectories_df(
    data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_tid: Text = TID,
    min_points_traj: int = 3
) -> DataFrame:
    """
    Generates a dataframe with the sequence of location points of a trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    label_tid : String, optional
        Label referring to the ID of the trajectories, by default TID
    min_points_traj : Number, optional
        Minimum points per trajectory, by default 3

    Return
    ------
    DataFrame
        DataFrame of the trajectories

    Example
    -------
    >>> from pymove.utils.data_augmentation import generate_trajectories_df
    >>>
    >>> df
        id            datetime  local         lat          lon          tid
    0    1 2017-09-02 21:59:34    162  -3.8431323  -38.5933142  12017090221
    1    1 2017-09-02 22:00:27     85  -3.8347478  -38.5921890  12017090222
    2    1 2017-09-02 22:01:36    673  -3.8235834  -38.5903890  12017090222
    3    1 2017-09-02 22:03:08    394  -3.8138890  -38.5904445  12017090222
    4    1 2017-09-02 22:03:46    263  -3.9067654  -38.5907723  12017090222
    5    1 2017-09-02 22:07:19    224  -3.8857223  -38.5928892  12017090222
    6    1 2017-09-02 22:07:40    623  -3.8828723  -38.5929789  12017090222
    >>>
    >>> traj_df = generate_trajectories_df(df)
    >>> traj_df.local
    0    [85, 673, 394, 263, 224, 623]
    Name: local, dtype: object
    """
    if label_tid not in data:
        raise ValueError('{} not in DataFrame'.format(label_tid))
    frames = []
    tids = data[label_tid].unique()
    desc = 'Generating Trajectories DataFrame'
    for tid in progress_bar(tids, desc=desc, total=len(tids)):
        frame = data[data[label_tid] == tid]
        if frame.shape[0] >= min_points_traj:
            frames.append(frame.T.values.tolist())
    return pd.DataFrame(frames, columns=data.columns)
def dbscan_clustering(move_data: DataFrame,
                      cluster_by: str,
                      meters: int = 10,
                      min_sample: float = 1680 / 2,
                      earth_radius: float = EARTH_RADIUS,
                      metric: str | Callable = 'euclidean',
                      inplace: bool = False) -> DataFrame | None:
    """
    Performs density-based clustering on the move_dataframe according to cluster_by.

    Parameters
    ----------
    move_data : dataframe
        the input trajectory
    cluster_by : str
        the column to cluster
    meters : int, optional
        distance to use in the clustering, by default 10
    min_sample : float, optional
        the minimum number of samples to consider a cluster, by default 1680/2
    earth_radius : float, optional
        the radius of the earth, used to convert meters into an eps value,
        by default EARTH_RADIUS
    metric : string or callable, optional
        The metric to use when calculating distance between instances in a
        feature array, by default 'euclidean'
    inplace : bool, optional
        Whether to modify move_data in place instead of returning a new
        DataFrame, by default False

    Returns
    -------
    DataFrame
        Clustered dataframe, or None if inplace is True
    """
    if not inplace:
        move_data = move_data[:]
    move_data.reset_index(drop=True, inplace=True)
    move_data[N_CLUSTER] = -1
    for cluster_id in progress_bar(move_data[cluster_by].unique(),
                                   desc='Clustering'):
        df_filter = move_data[move_data[cluster_by] == cluster_id]
        dbscan = DBSCAN(eps=meters_to_eps(meters, earth_radius),
                        min_samples=min_sample,
                        metric=metric)
        dbscan_result = dbscan.fit(df_filter[[LATITUDE, LONGITUDE]].to_numpy())
        idx = df_filter.index
        # offset the labels so cluster ids stay unique across groups
        res = dbscan_result.labels_ + move_data[N_CLUSTER].max() + 1
        move_data.at[idx, N_CLUSTER] = res
    if not inplace:
        return move_data
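# Usage sketch (assumes 'lat'/'lon' columns and an 'id' column to group by;
# 'n_cluster' is the column behind pymove's N_CLUSTER constant).
clustered = dbscan_clustering(move_df, cluster_by='id', meters=50)
print(clustered['n_cluster'].value_counts())   # points per cluster label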
def decode_geohash_to_latlon(data: DataFrame,
                             label_geohash: str = GEOHASH,
                             reset_index: bool = True):
    """
    Decode feature with hash of trajectories back to geographic coordinates.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    label_geohash : str, optional
        The name of the feature with hashed trajectories, by default GEOHASH
    reset_index : boolean, optional
        Condition to reset the df index, by default True

    Return
    ------
    None
        The columns 'lat_decode' and 'lon_decode' are added to `data` in place.

    Example
    -------
    >>> from pymove.utils.geoutils import decode_geohash_to_latlon
    >>> geoLife_df
             lat         lon          geohash
    0  39.984094  116.319236  wx4eqyvh4xkg0xs
    1  39.984198  116.319322  wx4eqyvhudszsev
    2  39.984224  116.319402  wx4eqyvhyx8d9wc
    3  39.984211  116.319389  wx4eqyvhyjnv5m7
    4  39.984217  116.319422  wx4eqyvhyyr2yy8
    >>> print(type(decode_geohash_to_latlon(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
             lat         lon          geohash  lat_decode  lon_decode
    0  39.984094  116.319236  wx4eqyvh4xkg0xs   39.984094  116.319236
    1  39.984198  116.319322  wx4eqyvhudszsev   39.984198  116.319322
    2  39.984224  116.319402  wx4eqyvhyx8d9wc   39.984224  116.319402
    3  39.984211  116.319389  wx4eqyvhyjnv5m7   39.984211  116.319389
    4  39.984217  116.319422  wx4eqyvhyyr2yy8   39.984217  116.319422
    """
    if label_geohash not in data:
        raise ValueError(f'feature {label_geohash} not in df')
    lat, lon, _, _ = _reset_and_create_arrays_none(data, reset_index=reset_index)
    for idx, row in progress_bar(data[[label_geohash]].iterrows(),
                                 total=data.shape[0]):
        lat_lon = _decode(row[label_geohash])
        lat[idx] = lat_lon[0]
        lon[idx] = lat_lon[1]
    data[LATITUDE_DECODE] = lat
    data[LONGITUDE_DECODE] = lon
def elbow_method(move_data: DataFrame,
                 k_initial: int = 1,
                 max_clusters: int = 15,
                 k_iteration: int = 1,
                 random_state: int | None = None) -> dict:
    """
    Determines the optimal number of clusters.

    Searches the range set by the user using the elbow method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial : int, optional
        The initial value used in the iteration of the elbow method.
        Represents the minimum number of clusters, by default 1
    max_clusters : int, optional
        The maximum value used in the iteration of the elbow method.
        Maximum number of clusters to test for, by default 15
    k_iteration : int, optional
        Increment value of the sequence used by the elbow method, by default 1
    random_state : int, RandomState instance
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic, by default None

    Returns
    -------
    dict
        The inertia values for the different numbers of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df, k_iteration=3)
    {
        1: 55084.15957839036,
        4: 245.68365592382938,
        7: 92.31472644640075,
        10: 62.618599956870355,
        13: 45.59653757292055,
    }
    """
    message = 'Executing Elbow Method for {} to {} clusters at {} steps\n'.format(
        k_initial, max_clusters, k_iteration)
    logger.debug(message)
    inertia_dic = {}
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration),
                          desc='Running KMeans'):
        km = KMeans(n_clusters=k, random_state=random_state)
        inertia_dic[k] = km.fit(move_data[[LATITUDE, LONGITUDE]]).inertia_
    return inertia_dic
def elbow_method(move_data, k_initial=1, max_clusters=15, k_iteration=1,
                 random_state=None):
    """
    Determines the optimal number of clusters in the range set by the user
    using the elbow method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial : int, optional (1 by default).
        The initial value used in the iteration of the elbow method.
        Represents the minimum number of clusters.
    max_clusters : int, optional (15 by default).
        The maximum value used in the iteration of the elbow method.
        Maximum number of clusters to test for.
    k_iteration : int, optional (1 by default).
        Increment value of the sequence used by the elbow method.
    random_state : int, RandomState instance, default=None
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.

    Returns
    -------
    dict
        The inertia values for the different numbers of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df[['lat', 'lon']], k_iteration=3)
    {
        1: 55084.15957839036,
        4: 245.68365592382938,
        7: 92.31472644640075,
        10: 62.618599956870355,
        13: 45.59653757292055,
    }
    """
    message = 'Executing Elbow Method for k from %s to %s, step %s\n'
    message = message % (k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    inertia_dic = {}
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration)):
        km = KMeans(n_clusters=k, random_state=random_state)
        inertia_dic[k] = km.fit(move_data).inertia_
    return inertia_dic
def generate_trajectories_df(
    data: Union['PandasMoveDataFrame', 'DaskMoveDataFrame']
) -> DataFrame:
    """
    Generates a dataframe with the sequence of location points of a trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.

    Return
    ------
    DataFrame
        DataFrame of the trajectories
    """
    if TID not in data:
        data.generate_tid_based_on_id_datetime()
        data.reset_index(drop=True, inplace=True)
    tids = data[TID].unique()
    new_df = pd.DataFrame(columns=data.columns)
    for tid in progress_bar(tids, total=len(tids)):
        filter_ = data[data[TID] == tid]
        filter_.reset_index(drop=True, inplace=True)
        # keep only trajectories with more than 4 points
        if filter_.shape[0] > 4:
            values = []
            for col in filter_.columns:
                if filter_[col].nunique() == 1:
                    # constant columns collapse to a single value
                    values.append(filter_.at[0, col])
                else:
                    # varying columns become the sequence of their values
                    values.append(
                        np.array(filter_[col],
                                 dtype=type(filter_.at[0, col])).tolist())
            row = pd.Series(values, filter_.columns)
            append_row(new_df, row=row)
    return new_df
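# Usage sketch: collapse point-level rows into one row per trajectory.
# Assumes a pymove move dataframe, which can generate the 'tid' column via
# generate_tid_based_on_id_datetime when it is missing.
traj_df = generate_trajectories_df(move_df)
print(traj_df.shape[0], 'trajectories with more than 4 points')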
def create_all_polygons_to_all_point_on_grid(self, data, unique_index=True):
    """
    Create all polygons to all points represented in a grid.

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        Represents the dataset that contains lat, lon and datetime.
    unique_index : boolean
        How to index the grid

    Returns
    -------
    pandas.core.frame.DataFrame
        Represents the same dataset with a new key 'polygon'
        where the polygons were saved.
    """
    operation = begin_operation('create_all_polygons_to_all_point_on_grid')
    try:
        self.create_update_index_grid_feature(data, unique_index=False)
        datapolygons = data.loc[
            :, ['id', 'index_grid_lat', 'index_grid_lon']
        ].drop_duplicates()
        size = datapolygons.shape[0]
        # transform series into numpy arrays
        index_grid_lat = np.array(data['index_grid_lat'])
        index_grid_lon = np.array(data['index_grid_lon'])
        polygons = np.array([])
        for i in progress_bar(range(size)):
            p = self.create_one_polygon_to_point_on_grid(
                index_grid_lat[i], index_grid_lon[i]
            )
            polygons = np.append(polygons, p)
        print('...polygons were created')
        datapolygons['polygon'] = polygons
        self.last_operation = end_operation(operation)
        return datapolygons
    except Exception:
        self.last_operation = end_operation(operation)
        raise
def create_bin_geohash_df(data: DataFrame, precision: int = 15):
    """
    Create trajectory geohash binaries and integrate with df.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    precision : int, optional
        Number of characters in the resulting geohash, by default 15

    Return
    ------
    None
        The column 'bin_geohash' is added to `data` in place.

    Example
    -------
    >>> from pymove.utils.geoutils import create_bin_geohash_df
    >>> geoLife_df
             lat         lon
    0  39.984094  116.319236
    1  39.984198  116.319322
    2  39.984224  116.319402
    3  39.984211  116.319389
    4  39.984217  116.319422
    >>> print(type(create_bin_geohash_df(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
             lat         lon                                        bin_geohash
    0  39.984094  116.319236  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    1  39.984198  116.319322  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    2  39.984224  116.319402  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    3  39.984211  116.319389  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    4  39.984217  116.319422  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    """
    *_, bin_geohash = _reset_and_create_arrays_none(data)
    for idx, row in progress_bar(data[[LATITUDE, LONGITUDE]].iterrows(),
                                 total=data.shape[0]):
        bin_geohash[idx] = _bin_geohash(row[LATITUDE], row[LONGITUDE], precision)
    data[BIN_GEOHASH] = bin_geohash
def elbow_method(move_data, k_initial=1, max_clusters=15, k_iteration=1):
    """
    Determines the optimal number of clusters in the range set by the user
    using the elbow method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial : int, optional (1 by default).
        The initial value used in the iteration of the elbow method.
        Represents the minimum number of clusters.
    max_clusters : int, optional (15 by default).
        The maximum value used in the iteration of the elbow method.
        Maximum number of clusters to test for.
    k_iteration : int, optional (1 by default).
        Increment value of the sequence used by the elbow method.

    Returns
    -------
    inertia_dic : dictionary
        The inertia values for the different numbers of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df[['lat', 'lon']],
                            k_initial=2, max_clusters=17, k_iteration=2)
    {2: 55084.15957839036,
     4: 245.68365592382938,
     6: 92.31472644640075,
     8: 62.618599956870355,
     10: 45.59653757292055,
     12: 34.32238676029195,
     14: 26.087387367439227,
     16: 20.64369311973992}
    """
    message = 'Executing Elbow Method for k from {} to {}, step {}\n'.format(
        k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    inertia_dic = {}
    for k in progress_bar(range(k_initial, max_clusters, k_iteration)):
        # validating each k value in k-means
        inertia_dic[k] = KMeans(n_clusters=k).fit(move_data).inertia_
    return inertia_dic
def create_geohash_df(data: DataFrame, precision: int = 15):
    """
    Create geohash from geographic coordinates and integrate with df.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    precision : int, optional
        Number of characters in the resulting geohash, by default 15

    Return
    ------
    None
        The column 'geohash' is added to `data` in place.

    Example
    -------
    >>> from pymove.utils.geoutils import create_geohash_df
    >>> geoLife_df
             lat         lon
    0  39.984094  116.319236
    1  39.984198  116.319322
    2  39.984224  116.319402
    3  39.984211  116.319389
    4  39.984217  116.319422
    >>> print(type(create_geohash_df(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
             lat         lon          geohash
    0  39.984094  116.319236  wx4eqyvh4xkg0xs
    1  39.984198  116.319322  wx4eqyvhudszsev
    2  39.984224  116.319402  wx4eqyvhyx8d9wc
    3  39.984211  116.319389  wx4eqyvhyjnv5m7
    4  39.984217  116.319422  wx4eqyvhyyr2yy8
    """
    _, _, geohash, _ = _reset_and_create_arrays_none(data)
    for idx, row in progress_bar(data[[LATITUDE, LONGITUDE]].iterrows(),
                                 total=data.shape[0]):
        geohash[idx] = _encode(row[LATITUDE], row[LONGITUDE], precision)
    data[GEOHASH] = geohash
def flatten_trajectories_dataframe(traj_df: DataFrame) -> DataFrame:
    """
    Extracts information from trajectories.

    Parameters
    ----------
    traj_df : DataFrame
        The input trajectories data

    Return
    ------
    DataFrame
        Flat trajectories.

    Example
    -------
    >>> from pymove.utils.data_augmentation import flatten_trajectories_dataframe
    >>>
    >>> traj_df
                 id                 local
    0     [1, 1, 1]        [85, 673, 394]
    1  [2, 2, 2, 2]  [263, 224, 623, 515]
    >>>
    >>> flatten_trajectories_dataframe(traj_df)
       id  local
    0   1     85
    1   1    673
    2   1    394
    3   2    263
    4   2    224
    5   2    623
    6   2    515
    """
    frames = []
    for _, row in progress_bar(traj_df.iterrows(), total=traj_df.shape[0]):
        # each row holds aligned lists; expand it into its own frame
        frames.append(pd.DataFrame(row.to_dict()))
    return pd.concat(frames, ignore_index=True)
def create_bin_geohash_df(data: DataFrame, precision: Optional[float] = 15):
    """
    Create trajectory geohash binaries and integrate with df.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    precision : float, optional
        Number of characters in resulting geohash, by default 15
    """
    _, _, _, bin_geohash = _reset_and_create_arrays_none(data)
    for idx, row in progress_bar(data[[LATITUDE, LONGITUDE]].iterrows(),
                                 total=data.shape[0]):
        bin_geohash[idx] = _bin_geohash(row[LATITUDE], row[LONGITUDE], precision)
    data[BIN_GEOHASH] = bin_geohash
    print('\n================================================')
    print('\n=====> bin_geohash feature was created. <=======')
    print('\n================================================')
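# Usage sketch (assumes 'lat'/'lon' columns; 'bin_geohash' is the column
# behind pymove's BIN_GEOHASH constant).
create_bin_geohash_df(geo_df, precision=8)
print(geo_df['bin_geohash'].iloc[0])   # list of bits encoding the geohash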
def knn_query(
    traj: DataFrame,
    move_df: DataFrame,
    k: Optional[int] = 5,
    id_: Optional[Text] = TRAJ_ID,
    distance: Optional[Text] = MEDP,
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME
) -> DataFrame:
    """
    Given a k, a trajectory and a DataFrame with multiple paths, it returns
    the k neighboring trajectories closest to the trajectory.

    Parameters
    ----------
    traj : dataframe
        The input of one trajectory.
    move_df : dataframe
        The input trajectory data.
    k : int, optional
        Number of neighboring trajectories, by default 5
    id_ : str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    distance : string, optional
        Distance measure type, by default MEDP
    latitude : string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude : string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime : string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME

    Returns
    -------
    DataFrame
        dataframe with near trajectories

    Raises
    ------
    ValueError
        If the distance measure is invalid
    """
    k_list = pd.DataFrame([[np.Inf, 'empty']] * k, columns=['distance', TRAJ_ID])
    if distance == MEDP:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.MEDP(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.MEDT(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')
    for traj_id in progress_bar(
        move_df[id_].unique(), desc='Querying knn by {}'.format(distance)
    ):
        if traj_id != traj[id_].values[0]:
            this = move_df.loc[move_df[id_] == traj_id]
            this_distance = dist_measure(traj, this, latitude, longitude, datetime)
            # insert into the first slot whose stored distance is larger
            for n in range(k):
                if this_distance < k_list.loc[n, 'distance']:
                    k_list.loc[n, 'distance'] = this_distance
                    k_list.loc[n, 'traj_id'] = traj_id
                    break
    result = traj.copy()
    print('Generating DataFrame with the k nearest trajectories')
    for n in range(k):
        result = result.append(
            move_df.loc[move_df[id_] == k_list.loc[n, 'traj_id']]
        )
    return result
def range_query(
    traj: DataFrame,
    move_df: DataFrame,
    _id: Optional[Text] = TRAJ_ID,
    min_dist: Optional[float] = 1000,
    distance: Optional[Text] = MEDP,
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME
) -> DataFrame:
    """
    Given a distance, a trajectory, and a DataFrame with several
    trajectories, it returns all trajectories whose distance to the
    informed trajectory is equal to or less than that distance.

    Parameters
    ----------
    traj : dataframe
        The input of one trajectory.
    move_df : dataframe
        The input trajectory data.
    _id : str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    min_dist : float, optional
        Minimum distance measure, by default 1000
    distance : string, optional
        Distance measure type, by default MEDP
    latitude : string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude : string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime : string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME

    Returns
    -------
    DataFrame
        dataframe with near trajectories

    Raises
    ------
    ValueError
        If the distance measure is invalid
    """
    result = traj.copy()
    result.drop(result.index, inplace=True)
    if distance == MEDP:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.MEDP(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.MEDT(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')
    for traj_id in progress_bar(
        move_df[_id].unique(), desc='Querying range by {}'.format(distance)
    ):
        this = move_df.loc[move_df[_id] == traj_id]
        if dist_measure(traj, this, latitude, longitude, datetime) < min_dist:
            result = result.append(this)
    return result
def _filter_by(move_data, label_id, label_new_tid, drop_single_points, **kwargs):
    """
    Splits the trajectories into segments.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : String, optional (dic_labels['id'] by default)
        Indicates the label of the id column in the user's dataframe.
    label_new_tid : String, optional (TID_PART by default)
        The label of the column containing the ids of the formed segments.
        Is the new split id.
    drop_single_points : boolean, optional (True by default)
        If set to True, drops the trajectories with only one point.
    **kwargs : arguments
        Depend on the type of segmentation
        - all : whether the segmentation uses all features
        - max_dist : maximum distance between adjacent points
        - max_time : maximum time between adjacent points
        - max_speed : maximum speed between adjacent points
        - feature : feature to use for segmentation
        - max_between_adj_points : maximum value for feature

    Returns
    -------
    dataframe
        DataFrame with the additional feature label_new_tid, which indicates
        the trajectory segment to which the point belongs.

    Note
    ----
    Time, distance and speed features must be updated after the split.
    """
    curr_tid, ids, count = _prepare_segmentation(move_data, label_id, label_new_tid)
    for idx in progress_bar(ids, desc='Generating %s' % label_new_tid):
        if kwargs['all']:
            filter_ = _filter_and_dist_time_speed(move_data, idx,
                                                  kwargs['max_dist'],
                                                  kwargs['max_time'],
                                                  kwargs['max_speed'])
        else:
            filter_ = _filter_or_dist_time_speed(
                move_data, idx, kwargs['feature'],
                kwargs['max_between_adj_points'])
        curr_tid, count = _update_curr_tid_count(filter_, move_data, idx,
                                                 label_new_tid, curr_tid, count)
    if label_id == label_new_tid:
        move_data.reset_index(drop=True, inplace=True)
        print('... label_tid = label_new_id, then resetting and dropping index')
    else:
        move_data.reset_index(inplace=True)
        print('... Resetting index\n')
    if drop_single_points:
        _drop_single_point(move_data, label_new_tid, label_id)
        move_data.generate_dist_time_speed_features()
    return move_data
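# Usage sketch (internal helper; hypothetical limits). With all=True the
# distance, time and speed limits apply jointly; otherwise a single feature
# and its threshold drive the split.
_filter_by(move_df, 'id', 'tid_part', drop_single_points=True,
           all=True, max_dist=50, max_time=120, max_speed=30)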
def query_all_points_by_range(
    traj1: DataFrame,
    move_df: DataFrame,
    minimum_meters: float = 100,
    minimum_time: timedelta = None
) -> DataFrame:
    """
    Queries the closest points within a spatial range based on meters
    and a temporal range.

    Selects only the points between two Move Dataframes that have the
    closest point within a spatial range based on meters and a temporal range.

    Parameters
    ----------
    traj1 : dataframe
        The input of a trajectory data.
    move_df : dataframe
        The input of another trajectory data.
    minimum_meters : float, optional
        the minimum spatial distance, in meters, between the points,
        by default 100
    minimum_time : datetime.timedelta, optional
        the minimum temporal distance between the points,
        by default timedelta(minutes=2)

    Returns
    -------
    DataFrame
        dataframe with all the points of move_df which are within a spatial
        and a temporal distance equal to or smaller than the minimum
        distance parameters.

    Examples
    --------
    >>> from pymove.query.query import query_all_points_by_range
    >>> traj_df
        lat    lon            datetime   id
    0  16.4  -54.9 2014-10-11 18:00:00    1
    1  16.4  -55.9 2014-10-12 00:00:00    1
    2  16.4  -56.9 2014-10-12 06:00:00    1
    >>> move_df
        lat    lon            datetime   id
    0  33.1  -77.0 2012-05-19 00:00:00    2
    1  32.8  -77.1 2012-05-19 06:00:00    3
    2  32.5  -77.3 2012-05-19 12:00:00    4
    >>> query_all_points_by_range(
    >>>    traj_df, move_df, minimum_meters=3190000, minimum_time=timedelta(hours=21010)
    >>> )
        lat    lon            datetime   id  spatial_distance  target_id  \\
    0  32.5  -77.3 2012-05-19 12:00:00    4      3.182834e+06          1
       target_lat  target_lon     target_datetime  temporal_distance
    0        16.4       -54.9 2014-10-11 18:00:00  875 days 06:00:00
    """
    if minimum_time is None:
        minimum_time = timedelta(minutes=2)
    result = DataFrame([])
    total = traj1.shape[0]
    for _, row in progress_bar(
        traj1.iterrows(),
        desc='Querying all points by temporal and spatial distance',
        total=total
    ):
        coinc_points = _meters_filter(row, move_df, minimum_meters)
        coinc_points = _datetime_filter(row, coinc_points, minimum_time)
        result = coinc_points.append(result)
    return result
def range_query(
    traj: DataFrame,
    move_df: DataFrame,
    _id: str = TRAJ_ID,
    min_dist: float = 1000,
    distance: str = MEDP,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME
) -> DataFrame:
    """
    Returns all trajectories within the given distance of the informed trajectory.

    Takes a distance, a trajectory, and a DataFrame with several trajectories.

    Parameters
    ----------
    traj : dataframe
        The input of one trajectory.
    move_df : dataframe
        The input trajectory data.
    _id : str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    min_dist : float, optional
        Minimum distance measure, by default 1000
    distance : string, optional
        Distance measure type, by default MEDP
    latitude : string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude : string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime : string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME

    Returns
    -------
    DataFrame
        dataframe with near trajectories

    Raises
    ------
    ValueError
        If the distance measure is invalid

    Examples
    --------
    >>> from pymove.query.query import range_query
    >>> traj_df
        lat    lon            datetime   id
    0  16.4  -54.9 2014-10-11 18:00:00    1
    1  16.4  -55.9 2014-10-12 00:00:00    1
    2  16.4  -56.9 2014-10-12 06:00:00    1
    >>> move_df
        lat    lon            datetime   id
    0  33.1  -77.0 2012-05-19 00:00:00    2
    1  32.8  -77.1 2012-05-19 06:00:00    3
    2  32.5  -77.3 2012-05-19 12:00:00    4
    >>> range_query(
    >>>    traj_df, move_df, min_dist=80.5
    >>> )
        lat    lon            datetime   id
    1  32.8  -77.1 2012-05-19 06:00:00    3
    2  32.5  -77.3 2012-05-19 12:00:00    4
    """
    result = traj.copy()
    result.drop(result.index, inplace=True)
    if distance == MEDP:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medp(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medt(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')
    for traj_id in progress_bar(
        move_df[_id].unique(), desc=f'Querying range by {distance}'
    ):
        this = move_df.loc[move_df[_id] == traj_id]
        if dist_measure(traj, this, latitude, longitude, datetime) < min_dist:
            result = result.append(this)
    return result
def knn_query(
    traj: DataFrame,
    move_df: DataFrame,
    k: int = 5,
    id_: str = TRAJ_ID,
    distance: str = MEDP,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME
) -> DataFrame:
    """
    Returns the k neighboring trajectories closest to the trajectory.

    Takes a k, a trajectory and a DataFrame with multiple paths.

    Parameters
    ----------
    traj : dataframe
        The input of one trajectory.
    move_df : dataframe
        The input trajectory data.
    k : int, optional
        Number of neighboring trajectories, by default 5
    id_ : str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    distance : string, optional
        Distance measure type, by default MEDP
    latitude : string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude : string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime : string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME

    Returns
    -------
    DataFrame
        dataframe with near trajectories

    Raises
    ------
    ValueError
        If the distance measure is invalid

    Examples
    --------
    >>> from pymove.query.query import knn_query
    >>> traj_df
        lat    lon            datetime   id
    0  16.4  -54.9 2014-10-11 18:00:00    1
    1  16.4  -55.9 2014-10-12 00:00:00    1
    2  16.4  -56.9 2014-10-12 06:00:00    1
    >>> move_df
        lat    lon            datetime   id
    0  33.1  -77.0 2012-05-19 00:00:00    2
    1  32.8  -77.1 2012-05-19 06:00:00    3
    2  32.5  -77.3 2012-05-19 12:00:00    4
    >>> knn_query(
    >>>    traj_df, move_df, k=1
    >>> )
        lat    lon            datetime   id
    0  16.4  -54.9 2014-10-11 18:00:00    1
    1  16.4  -55.9 2014-10-12 00:00:00    1
    2  16.4  -56.9 2014-10-12 06:00:00    1
    2  32.5  -77.3 2012-05-19 12:00:00    4
    """
    k_list = pd.DataFrame([[np.Inf, 'empty']] * k, columns=['distance', TRAJ_ID])
    if distance == MEDP:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medp(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(traj, this, latitude, longitude, datetime):
            return distances.medt(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')
    for traj_id in progress_bar(
        move_df[id_].unique(), desc=f'Querying knn by {distance}'
    ):
        if traj_id != traj[id_].values[0]:
            this = move_df.loc[move_df[id_] == traj_id]
            this_distance = dist_measure(traj, this, latitude, longitude, datetime)
            # insert into the first slot whose stored distance is larger
            for n in range(k):
                if this_distance < k_list.loc[n, 'distance']:
                    k_list.loc[n, 'distance'] = this_distance
                    k_list.loc[n, 'traj_id'] = traj_id
                    break
    result = traj.copy()
    logger.debug('Generating DataFrame with k nearest trajectories.')
    for n in range(k):
        result = result.append(
            move_df.loc[move_df[id_] == k_list.loc[n, 'traj_id']]
        )
    return result
def compress_segment_stop_to_point(
    move_data: DataFrame,
    label_segment: str = SEGMENT_STOP,
    label_stop: str = STOP,
    point_mean: str = 'default',
    drop_moves: bool = False,
    label_id: str = TRAJ_ID,
    dist_radius: float = 30,
    time_radius: float = 900,
    inplace: bool = False,
) -> DataFrame:
    """
    Compress the trajectories using the stop points in the dataframe.

    Compress a segment to a point, setting lat_mean and lon_mean
    for each segment.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_segment : String, optional
        The label of the column containing the ids of the formed segments.
        Is the new split id, by default SEGMENT_STOP
    label_stop : String, optional
        The name of the column that indicates if a point is a stop,
        by default STOP
    point_mean : String, optional
        Indicates whether the mean points should be calculated using
        centroids or the point that repeats the most, by default 'default'
    drop_moves : Boolean, optional
        If set to True, the moving points will be dropped from the
        dataframe, by default False
    label_id : String, optional
        Used to create the stay points used in the compression. If the
        dataset already has the stop move, this parameter should be ignored.
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    dist_radius : Double, optional
        Used to create the stay points used in the compression,
        by default 30. If the dataset already has the stop move, this
        parameter should be ignored. The first step in this function is
        segmenting the trajectory; the segments are used to find the stop
        points, and dist_radius defines the distance used in the
        segmentation.
    time_radius : Double, optional
        Used to create the stay points used in the compression,
        by default 900. If the dataset already has the stop move, this
        parameter should be ignored. The time_radius is used to determine
        if a segment is a stop: if the user stayed in the segment for a
        time greater than time_radius, then the segment is a stop.
    inplace : boolean, optional
        If set to True the original dataframe will be altered to contain
        the result of the compression, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        Data with 3 additional features (segment_stop, lat_mean and
        lon_mean), or None. segment_stop indicates the trajectory segment
        to which the point belongs. With the default option, lat_mean and
        lon_mean are defined by the point that repeats most within the
        segment; with the centroid option, they are defined by the
        centroid of all points in the segment.
    """
    if not inplace:
        move_data = move_data.copy()
    if (label_segment not in move_data) and (label_stop not in move_data):
        create_or_update_move_stop_by_dist_time(move_data, dist_radius,
                                                time_radius, label_id,
                                                inplace=True)
    logger.debug('...setting mean to lat and lon...')
    lat_mean = np.full(move_data.shape[0], -1.0, dtype=np.float64)
    lon_mean = np.full(move_data.shape[0], -1.0, dtype=np.float64)
    if not drop_moves:
        # moving points keep NaN means instead of being dropped later
        lat_mean[move_data[~move_data[label_stop]].index] = np.NaN
        lon_mean[move_data[~move_data[label_stop]].index] = np.NaN
    else:
        logger.debug('...move segments will be dropped...')
    logger.debug('...get only stop segments...')
    segments = move_data[move_data[label_stop]][label_segment].unique()
    for idx in progress_bar(
        segments, desc=f'Generating {label_segment} and {label_stop}'
    ):
        filter_ = move_data[label_segment] == idx
        size_id = move_data[filter_].shape[0]
        # skip segments with a single point
        if size_id > 1:
            # get first and last point of each stop segment
            ind_start = move_data[filter_].iloc[[0]].index
            ind_end = move_data[filter_].iloc[[-1]].index
            if point_mean == 'default':
                # use the most frequent (lat, lon) pair in the segment
                p = (move_data[filter_].groupby(
                    [LATITUDE, LONGITUDE], as_index=False
                ).agg({'id': 'count'}).sort_values(['id']).tail(1))
                lat_mean[ind_start] = p.iloc[0, 0]
                lon_mean[ind_start] = p.iloc[0, 1]
                lat_mean[ind_end] = p.iloc[0, 0]
                lon_mean[ind_end] = p.iloc[0, 1]
            elif point_mean == 'centroid':
                # set lat and lon mean on the first and last
                # points of each segment
                lat_mean[ind_start] = move_data.loc[filter_][LATITUDE].mean()
                lon_mean[ind_start] = move_data.loc[filter_][LONGITUDE].mean()
                lat_mean[ind_end] = move_data.loc[filter_][LATITUDE].mean()
                lon_mean[ind_end] = move_data.loc[filter_][LONGITUDE].mean()
        else:
            logger.debug(f'There are segments with only one point: {idx}')
    move_data[LAT_MEAN] = lat_mean
    move_data[LON_MEAN] = lon_mean
    del lat_mean
    del lon_mean
    shape_before = move_data.shape[0]
    # points that never received a mean are dropped
    filter_drop = ((move_data[LAT_MEAN] == -1.0)
                   & (move_data[LON_MEAN] == -1.0))
    shape_drop = move_data[filter_drop].shape[0]
    if shape_drop > 0:
        logger.debug('...Dropping %s points...' % shape_drop)
        move_data.drop(move_data[filter_drop].index, inplace=True)
    logger.debug('...Shape_before: %s\n...Current shape: %s'
                 % (shape_before, move_data.shape[0]))
    if not inplace:
        return move_data
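# Usage sketch (hypothetical values; assumes a move dataframe with 'lat',
# 'lon', 'datetime' and 'id' columns so stop points can be generated).
compressed = compress_segment_stop_to_point(
    move_df, dist_radius=30, time_radius=900, drop_moves=True)
print(compressed[['segment_stop', 'lat_mean', 'lon_mean']].head())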