def _bounding_boxes_greedy_(pd_trajectory, duration_min, diam_max1,
                            duration_max1, diam_max2):
    """
    Takes a pandas data frame of trajectory data and splits it so that each
    frame contains consecutive data whose locations are contained within a
    bounding box. A greedy approach is used: a new bounding box is started
    whenever a trajectory point violates the bounding box constraints
    specified by the parameters.

    :param pd_trajectory: pandas frame with trajectory data
    :param duration_min: float, skip segments whose duration is below this value
    :param diam_max1: float, maximum diameter of first resulting bounding box
    :param duration_max1: float, maximum duration of first resulting bounding box
    :param diam_max2: float, maximum diameter of second resulting bounding box
    """
    trajectories = []
    if len(pd_trajectory) == 0:
        return trajectories

    start_idx = 0  # start of current segment
    time_min = pd_trajectory.iloc[0]['time']
    time_max = time_min
    lat_min = pd_trajectory.iloc[0]['latitude']
    lat_max = lat_min
    long_min = pd_trajectory.iloc[0]['longitude']
    long_max = long_min

    idx = 1
    while idx < len(pd_trajectory):
        row = pd_trajectory.iloc[idx]
        time, latitude, longitude = row.time, row.latitude, row.longitude
        time_min = min(time_min, time)
        time_max = max(time_max, time)
        lat_min = min(lat_min, latitude)
        lat_max = max(lat_max, latitude)
        long_min = min(long_min, longitude)
        long_max = max(long_max, longitude)

        diam = haversine_distance(lat_min, long_min, lat_max, long_max)
        if ((diam > diam_max1 and time_max - time_min <= duration_max1)
                or (diam > diam_max2 and time_max - time_min > duration_max1)):
            # Segment found. Skip it if its duration is too short.
            if time_max - time_min < duration_min:
                logger.warning(
                    f"Skipping trajectory segment due to short duration {time_max - time_min}"
                )
            else:
                trajectories.append(pd_trajectory.iloc[start_idx:idx, :])
            # Restart the bounding box from the current point.
            start_idx = idx
            time_min, time_max = time, time
            lat_min, lat_max = latitude, latitude
            long_min, long_max = longitude, longitude
        idx += 1

    if start_idx < len(pd_trajectory) - 1:
        trajectories.append(pd_trajectory.iloc[start_idx:, :])
    return trajectories
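# The functions in this module rely on haversine_distance(lat1, lon1, lat2,
# lon2). For reference, below is a minimal sketch of the conventional
# great-circle formula with that signature (degrees in, meters out). The
# units are an assumption; this is not necessarily the project's own
# haversine_distance implementation.
import math

def _haversine_distance_reference(lat1, lon1, lat2, lon2, radius_m=6371000.0):
    # Convert to radians and apply the haversine formula.
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlam = math.radians(lon2 - lon1)
    a = (math.sin(dphi / 2) ** 2
         + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2)
    return 2.0 * radius_m * math.asin(math.sqrt(a))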
def conseq_distance_meters(frame):
    '''Frame[col=["latitude", "longitude", ...]] -> distances of conseq rows'''
    lat, lon = frame['latitude'].to_numpy(), frame['longitude'].to_numpy()
    return np.array([
        haversine_distance(lat[i], lon[i], lat[i + 1], lon[i + 1])
        for i in range(len(lat) - 1)
    ])
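# For long trajectories the Python-level loop above can dominate the runtime;
# a vectorized sketch follows. It assumes haversine_distance accepts NumPy
# arrays and broadcasts elementwise, which may not hold for the
# implementation used in this project.
def _conseq_distance_meters_vectorized(frame):
    lat = frame['latitude'].to_numpy()
    lon = frame['longitude'].to_numpy()
    # Distance between each row and its successor, computed in one call.
    return haversine_distance(lat[:-1], lon[:-1], lat[1:], lon[1:])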
def _diam_(pd_trajectory):
    """ Computes the diameter of a trajectory """
    # The "diameter" is taken as the great-circle length of the bounding box
    # diagonal, not the exact maximum pairwise distance between rows.
    lat_min = pd_trajectory['latitude'].min()
    lat_max = pd_trajectory['latitude'].max()
    long_min = pd_trajectory['longitude'].min()
    long_max = pd_trajectory['longitude'].max()
    return haversine_distance(lat_min, long_min, lat_max, long_max)
def _find_sequence_startpoints(self, allowed_jump, hard_time_gap):
    """
    Returns an ordered list of indices of time sequence start points. Within
    a time sequence, no two consecutive time points are further than
    hard_time_gap apart, and the distance travelled between consecutive
    points is at most allowed_jump; otherwise a break point is added.
    Note that index 0 is always contained as a start point.
    """
    time_steps = np.diff(self.data[:, 0])
    sequence_startpoints = [0]
    mask = [True] * self.get_n_time_stamps()
    for i in range(self.get_n_time_stamps() - 1):
        # A large time gap keeps i + 1 as a start point.
        if time_steps[i] >= hard_time_gap:
            continue
        distance_jumped = haversine_distance(self.data[i, 1], self.data[i, 2],
                                             self.data[i + 1, 1],
                                             self.data[i + 1, 2])
        # A large jump in space also keeps i + 1 as a start point.
        if distance_jumped > allowed_jump:
            continue
        # Otherwise i + 1 continues the current sequence.
        mask[i + 1] = False
    sequence_startpoints.extend(
        [i for i, val in enumerate(mask) if (val and i != 0)])
    return sequence_startpoints
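# Usage sketch (names are illustrative; `traj` denotes an instance of the
# surrounding class): the start points can be turned into slices of the
# data array, e.g.
#
#   starts = traj._find_sequence_startpoints(allowed_jump=50, hard_time_gap=3600)
#   bounds = starts + [traj.get_n_time_stamps()]
#   sequences = [traj.data[a:b] for a, b in zip(bounds, bounds[1:])]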
def convolution_filtre(path_1, path_2, timestep, weight_dist_max,
                       weight_dist_min, weight_min_val, filtre_size):
    distance = 0
    count_weights = 0
    total_accuracy = 0
    # Loop over nearby time-steps.
    for ti in range(timestep - int(filtre_size / 2),
                    timestep + int(filtre_size / 2) + 1):
        # There is no padding, so skip values outside of the boundaries.
        if ti < 0 or ti >= path_1.shape[0] or ti >= path_2.shape[0]:
            continue
        # Similarly, skip undefined values.
        if (path_1[ti, 0] == 0 or path_1[ti, 1] == 0
                or path_2[ti, 0] == 0 or path_2[ti, 1] == 0):
            continue
        # Compute weights based on accuracy.
        w1 = weight_accuracy(path_1[ti, 2], weight_dist_max, weight_dist_min,
                             weight_min_val)
        w2 = weight_accuracy(path_2[ti, 2], weight_dist_max, weight_dist_min,
                             weight_min_val)
        # Try to replace with nearby values in case of strong inaccuracy.
        ti_path_1 = ti
        ti_path_2 = ti
        # Check for w1.
        if w1 == weight_min_val:
            if ti >= 1:
                w1_new = weight_accuracy(path_1[ti - 1, 2], weight_dist_max,
                                         weight_dist_min, weight_min_val)
                if w1_new > w1 and path_1[ti - 1, 0] != 0 and path_1[ti - 1, 1] != 0:
                    w1 = w1_new
                    ti_path_1 = ti - 1
            if ti < path_1.shape[0] - 1:
                w1_new = weight_accuracy(path_1[ti + 1, 2], weight_dist_max,
                                         weight_dist_min, weight_min_val)
                if w1_new > w1 and path_1[ti + 1, 0] != 0 and path_1[ti + 1, 1] != 0:
                    w1 = w1_new
                    ti_path_1 = ti + 1
        # Check for w2 (ti >= 1, mirroring the w1 case above).
        if w2 == weight_min_val:
            if ti >= 1:
                w2_new = weight_accuracy(path_2[ti - 1, 2], weight_dist_max,
                                         weight_dist_min, weight_min_val)
                if w2_new > w2 and path_2[ti - 1, 0] != 0 and path_2[ti - 1, 1] != 0:
                    w2 = w2_new
                    ti_path_2 = ti - 1
            if ti < path_2.shape[0] - 1:
                w2_new = weight_accuracy(path_2[ti + 1, 2], weight_dist_max,
                                         weight_dist_min, weight_min_val)
                if w2_new > w2 and path_2[ti + 1, 0] != 0 and path_2[ti + 1, 1] != 0:
                    w2 = w2_new
                    ti_path_2 = ti + 1
        # Accumulate weights, distances, and accuracies.
        count_weights += w1 * w2
        distance += w1 * w2 * haversine_distance(
            path_1[ti_path_1, 0], path_1[ti_path_1, 1],
            path_2[ti_path_2, 0], path_2[ti_path_2, 1])
        total_accuracy += w1 * w2 * (path_1[ti_path_1, 2] + path_2[ti_path_2, 2])
    # Return distance estimates.
    if count_weights != 0:
        dist_estimate = distance / count_weights
        dist_min = (distance - total_accuracy) / count_weights
        dist_max = (distance + total_accuracy) / count_weights
        return (dist_estimate, max(0, dist_min), dist_max)
    else:
        return (1e9, 1e9, 1e9)
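# weight_accuracy is used above but not defined in this module. Judging by
# its parameters, it maps a GPS accuracy radius to a weight between
# weight_min_val and 1; the linear ramp below is a hypothetical sketch of
# such a mapping, not the project's actual implementation.
def _weight_accuracy_sketch(accuracy, weight_dist_max, weight_dist_min,
                            weight_min_val):
    if accuracy <= weight_dist_min:
        return 1.0  # accurate fix: full weight
    if accuracy >= weight_dist_max:
        return weight_min_val  # inaccurate fix: floor weight
    # Linear interpolation between the two thresholds.
    frac = (accuracy - weight_dist_min) / (weight_dist_max - weight_dist_min)
    return 1.0 + frac * (weight_min_val - 1.0)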
def add_distance_to_df(df):
    """ Adds distance as a column to the supplied pd.DataFrame """
    lats1 = np.array(df["latitude"].iloc[:-1].values, dtype=np.float64)
    lats2 = np.array(df["latitude"].iloc[1:].values, dtype=np.float64)
    lons1 = np.array(df["longitude"].iloc[:-1].values, dtype=np.float64)
    lons2 = np.array(df["longitude"].iloc[1:].values, dtype=np.float64)
    # The first row has no predecessor, so its distance is 0.
    distance = [0]
    for lat1, lat2, lon1, lon2 in zip(lats1, lats2, lons1, lons2):
        distance.append(
            haversine_distance(lat1=lat1, lon1=lon1, lat2=lat2, lon2=lon2))
    df.insert(len(df.columns), "distance", distance)
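# Minimal usage sketch for add_distance_to_df; the coordinates are
# illustrative.
def _demo_add_distance_to_df():
    df = pd.DataFrame({'latitude': [59.911, 59.912, 59.913],
                       'longitude': [10.750, 10.751, 10.752]})
    add_distance_to_df(df)
    # df now has a 'distance' column; its first entry is 0 by construction.
    return df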
def get_max_dist_meters(frame):
    '''Frame[col=["latitude", "longitude", ...]] -> max distance of rows'''
    if not len(frame):
        return 0
    res = 0
    lat, lon = frame['latitude'].to_numpy(), frame['longitude'].to_numpy()
    for i, (ai, oi) in enumerate(zip(lat, lon)):
        for (aj, oj) in zip(lat[i + 1:], lon[i + 1:]):
            res = max(res, haversine_distance(ai, oi, aj, oj))
    return res
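# get_max_dist_meters is O(n^2) in the number of rows. When the caller only
# needs a threshold test, the bounding-box diagonal from _diam_ can reject
# cheaply first, since it bounds every pairwise distance to good
# approximation at city-scale extents. A sketch under that assumption:
def _max_dist_exceeds(frame, threshold_m):
    # Cheap reject: if the diagonal fits, every pair of rows does too.
    if _diam_(frame) <= threshold_m:
        return False
    # Otherwise fall back to the exact pairwise scan.
    return get_max_dist_meters(frame) > threshold_m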
def add_mode_of_transport_to_df(df,
                                stop_duration_threshold=30,
                                distance_threshold=30,
                                pt_search_radius=10):  # currently unused here
    """ Adds transport mode as a column to the supplied pd.DataFrame """
    transport = [mode_of_transport_from_speed(0)]
    center_point = df.iloc[0]
    last_point = df.iloc[0]
    # Candidate labels for the current run of points, committed either as a
    # stop (still) or as movement once the run ends.
    potential_still_points = []
    potential_move_points = []
    stop_points = []
    stop_duration = 0
    for index, current_point in df.iloc[1:].iterrows():
        time_since_last_point = current_point.timefrom - last_point.timeto
        distance_from_center = haversine_distance(lat1=center_point.latitude,
                                                  lon1=center_point.longitude,
                                                  lat2=current_point.latitude,
                                                  lon2=current_point.longitude)
        predicted = mode_of_transport_from_speed(current_point.speed * 3.6)
        potential_move_points.append(predicted)
        if distance_from_center < distance_threshold:
            # Still within the radius of the current center: may be a stop.
            potential_still_points.append(_TRANSPORT_TYPES[0])
            stop_duration += time_since_last_point
        else:
            # Left the radius: decide whether the finished run was a stop.
            if stop_duration > stop_duration_threshold:
                stop_points.append(
                    (index - len(potential_still_points), index))
                potential_still_points.append(_TRANSPORT_TYPES[0])
                transport.extend(potential_still_points)
            else:
                transport.extend(potential_move_points)
            potential_move_points = []
            potential_still_points = []
            center_point = current_point
            stop_duration = 0
        last_point = current_point
    transport.extend(potential_still_points)
    df.insert(len(df.columns), "transport", transport)
def inspect(self, allowed_jump, time_gap):
    """
    Method for inspecting given trajectory data.
    Outputs:
      - time span
      - all 'gaps' in the data where either the time gap is surpassed or
        the distance moved is more than allowed_jump
    """
    print("\n")
    print(self)
    if self._empty_():
        print("Data array is empty - no inspection possible")
        return
    print(
        "Data covers period {0} - {1} Time Delta = {2} (h,m,s) n_timestamps = {3}"
        .format(
            datetime.utcfromtimestamp(
                self.get_min_time()).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.utcfromtimestamp(
                self.get_max_time()).strftime('%Y-%m-%d %H:%M:%S'),
            convert_seconds(self.get_max_time() - self.get_min_time()),
            self.get_n_time_stamps()))
    print("GPS Gaps:")
    # time_gap is given in hours; convert to seconds.
    time_gap_s = time_gap * 60 * 60
    gaps = self._find_sequence_startpoints(allowed_jump, time_gap_s)[1:]
    for gap in gaps:
        distance = haversine_distance(self.data[gap - 1, 1],
                                      self.data[gap - 1, 2],
                                      self.data[gap, 1], self.data[gap, 2])
        print(
            " * {0} - {1} Time Delta = {2} (h,m,s) Distance {3}m GPS accuracy {4}m"
            .format(
                datetime.utcfromtimestamp(
                    self.data[gap - 1, 0]).strftime('%Y-%m-%d %H:%M:%S'),
                datetime.utcfromtimestamp(
                    self.data[gap, 0]).strftime('%Y-%m-%d %H:%M:%S'),
                convert_seconds(self.data[gap, 0] - self.data[gap - 1, 0]),
                round(distance),
                max(self.data[gap - 1, 3], self.data[gap, 3])))
def load_azure_data(query,
                    outlier_threshold=100,
                    include_attributes=_DEFAULT_INCLUDE_ATTRIBUTES,
                    dt_threshold=None,
                    dx_threshold=None):
    """
    Loads data from the Azure database and returns a dictionary of uuids and
    user events.

    dt_threshold is None or a number. None keeps the original data. With a
    number, the data is filtered so that two consecutive events are at least
    dt_threshold apart. NOTE: the dt_threshold value is in seconds.

    dx_threshold is None or a number. None keeps the original data. With a
    number, a distance threshold (in meters) is applied so that consecutive
    kept events have distance > dx_threshold.
    """
    with timer("db connect"):
        db = connect_to_azure_database()
    db_func = re.search(r'(FROM|from) (\w*)', query).group(2)
    with timer(f"db query {db_func}"):
        df = pd.read_sql(
            query,
            con=db,
            parse_dates=["timeto", "timefrom"],
        )
    db.close()
    df = df.sort_values(by='timefrom')
    df = df.reset_index(drop=True)

    # Time coarsening
    if dt_threshold is not None:
        # NOTE: here we get timestamps and timedelta as their diff, so
        # convert the threshold for comparison.
        assert dt_threshold > 0
        dt_threshold = timedelta(days=0, seconds=dt_threshold)
        # Setup for recreating a valid (with correct columns) but empty frame
        keys = list(df.keys())
        df = df[sparsify_mask(df['timefrom'], dt_threshold)]
        if not len(df):
            print('GPS time coarsening yielded empty frame')
            df = pd.DataFrame(columns=keys)

    # Space coarsening
    if dx_threshold is not None:
        assert dx_threshold > 0
        position = np.c_[df['latitude'].to_numpy(), df['longitude'].to_numpy()]
        # Inside the filter we want the distance between rows x and y.
        distance = lambda x, y: haversine_distance(x[0], x[1], y[0], y[1])
        keys = list(df.keys())
        df = df[sparsify_mask(position, threshold=dx_threshold, distance=distance)]
        if not len(df):
            print('GPS distance coarsening yielded empty frame')
            df = pd.DataFrame(columns=keys)

    df = df.loc[:, include_attributes]
    data_dict = {}
    for uuid in df.uuid.unique():
        user_data = df.loc[df["uuid"] == uuid]
        data_dict[uuid.lower()] = process_data_frame(user_data, outlier_threshold)
    return data_dict
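# Example invocation (a sketch; the table name in the query is an
# illustrative assumption, not necessarily the real schema):
#
#   data = load_azure_data("SELECT * FROM gpsevents",
#                          dt_threshold=30,  # keep events >= 30 s apart
#                          dx_threshold=10)  # and > 10 m apart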
def sqm(self) -> float:
    # Approximates the bounding box area in square meters as the product of
    # its latitudinal and longitudinal extents.
    return haversine_distance(self.minlat, self.minlon, self.maxlat, self.minlon) * \
        haversine_distance(self.minlat, self.minlon, self.minlat, self.maxlon)