def continue_just_ended(self, idx, currPoint, filtered_points_df):
    """
    Normally, since the logic here and the logic on the phone are the same,
    if we have detected a trip end, any points after this are part of the
    new trip.

    However, in some circumstances, notably in my data from 27th August,
    there appears to be a mismatch and we get a couple of points past the
    end that we detected here. So let's look for points that are within the
    distance filter, and are at a delta of a minute, and join them to the
    just ended trip instead of using them to start the new trip.

    :param idx: index of the current point
    :param currPoint: current point
    :param filtered_points_df: dataframe of filtered points
    :return: True if we should continue the just ended trip, False otherwise
    """
    if idx == 0:
        return False
    else:
        lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" %
                      (lastPoint,
                       pf.calDistance(lastPoint, currPoint) < self.distance_threshold,
                       currPoint.ts - lastPoint.ts <= self.time_threshold))
        # Unlike the time filter, with the distance filter, we concatenate all points
        # that are within the distance threshold with the previous trip
        # end, since because of the distance filter, even noisy points
        # can occur at an arbitrary time in the future
        if pf.calDistance(lastPoint, currPoint) < self.distance_threshold:
            logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                         (lastPoint, currPoint))
            return True
        else:
            return False
def continue_just_ended(self, idx, currPoint, filtered_points_df):
    """
    Normally, since the logic here and the logic on the phone are the same,
    if we have detected a trip end, any points after this are part of the
    new trip.

    However, in some circumstances, notably in my data from 27th August,
    there appears to be a mismatch and we get a couple of points past the
    end that we detected here. So let's look for points that are within the
    distance filter, and are at a delta of a minute, and join them to the
    just ended trip instead of using them to start the new trip.

    :param idx: index of the current point
    :param currPoint: current point
    :param filtered_points_df: dataframe of filtered points
    :return: True if we should continue the just ended trip, False otherwise
    """
    if idx == 0:
        return False
    else:
        prev_point = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        logging.debug("Comparing with prev_point = %s" % prev_point)
        if pf.calDistance(prev_point, currPoint) < self.distance_threshold and \
                currPoint.ts - prev_point.ts <= 60:
            logging.info("Points %s and %s are within the distance filter and only 1 min apart so part of the same trip" %
                         (prev_point, currPoint))
            return True
        else:
            return False
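# A minimal, self-contained sketch of the "glom onto the just-ended trip" decision
# above, using synthetic points and a stand-in haversine distance. The 100 m
# threshold, the dict-shaped points, and the helper names are assumptions for
# illustration only; the real code uses pf.calDistance on AttrDict rows from
# filtered_points_df.
import math

def _haversine_m(p1, p2):
    # great-circle distance in meters between two lat/lon points
    R = 6371000.0
    lat1, lon1, lat2, lon2 = map(math.radians,
                                 [p1["latitude"], p1["longitude"],
                                  p2["latitude"], p2["longitude"]])
    a = (math.sin((lat2 - lat1) / 2) ** 2 +
         math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * R * math.asin(math.sqrt(a))

def _would_glom(prev_point, curr_point, distance_threshold=100):
    # same shape as the check above: close in space AND at most a minute apart
    return (_haversine_m(prev_point, curr_point) < distance_threshold and
            curr_point["ts"] - prev_point["ts"] <= 60)

prev_point = {"latitude": 37.3900, "longitude": -122.0860, "ts": 1000}
curr_point = {"latitude": 37.3901, "longitude": -122.0861, "ts": 1045}
print(_would_glom(prev_point, curr_point))  # True: ~14 m apart, 45 s later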
def recalc_speed(points_df):
    """
    The input dataframe already has "speed" and "distance" columns.
    Drop them and recalculate speeds from the first point onwards.
    The speed column has the speed between each point and its previous point.
    The first row has a speed of zero.
    """
    stripped_df = points_df.drop("speed", axis=1).drop("distance", axis=1)
    logging.debug("columns in points_df = %s" % points_df.columns)
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    zipped_points_list = list(zip(point_list, point_list[1:]))

    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    distances.insert(0, 0)
    with_speeds_df = pd.concat(
        [stripped_df, pd.Series(distances, index=points_df.index, name="distance")],
        axis=1)

    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    speeds.insert(0, 0)
    with_speeds_df = pd.concat(
        [with_speeds_df, pd.Series(speeds, index=points_df.index, name="speed")],
        axis=1)

    return with_speeds_df
def add_dist_heading_speed(points_df):
    # type: (pandas.DataFrame) -> pandas.DataFrame
    """
    Returns a new dataframe with an added "speed" column.
    The speed column has the speed between each point and its previous point.
    The first row has a speed of zero.
    """
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    zipped_points_list = list(zip(point_list, point_list[1:]))

    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    distances.insert(0, 0)
    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    speeds.insert(0, 0)
    headings = [pf.calHeading(p1, p2) for (p1, p2) in zipped_points_list]
    headings.insert(0, 0)

    with_distances_df = pd.concat(
        [points_df, pd.Series(distances, name="distance")], axis=1)
    with_speeds_df = pd.concat(
        [with_distances_df, pd.Series(speeds, name="speed")], axis=1)
    if "heading" in with_speeds_df.columns:
        with_speeds_df.drop("heading", axis=1, inplace=True)
    with_headings_df = pd.concat(
        [with_speeds_df, pd.Series(headings, name="heading")], axis=1)
    return with_headings_df
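# A minimal, self-contained sketch of the consecutive-pair pattern used by
# recalc_speed / add_dist_heading_speed above: zip each point with its successor,
# compute per-pair values, and prepend a zero so the first row stays aligned.
# The toy dataframe and the flat Euclidean "distance" are assumptions for
# illustration; the real code uses pf.calDistance / pf.calSpeed on lat/lon points.
import pandas as pd

points_df = pd.DataFrame({
    "x": [0.0, 3.0, 3.0],
    "y": [0.0, 4.0, 4.0],
    "ts": [0, 10, 70],
})

point_list = points_df.to_dict("records")
pairs = list(zip(point_list, point_list[1:]))

distances = [((p2["x"] - p1["x"]) ** 2 + (p2["y"] - p1["y"]) ** 2) ** 0.5
             for (p1, p2) in pairs]
distances.insert(0, 0)  # first row has no previous point
speeds = [d / (p2["ts"] - p1["ts"]) for d, (p1, p2) in zip(distances[1:], pairs)]
speeds.insert(0, 0)

with_speeds_df = pd.concat(
    [points_df,
     pd.Series(distances, index=points_df.index, name="distance"),
     pd.Series(speeds, index=points_df.index, name="speed")],
    axis=1)
print(with_speeds_df)
# distance column: [0.0, 5.0, 0.0]; speed column: [0.0, 0.5, 0.0]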
def recalc_speed(points_df):
    """
    The input dataframe already has "speed" and "distance" columns.
    Drop them and recalculate speeds from the first point onwards.
    The speed column has the speed between each point and its previous point.
    The first row has a speed of zero.
    """
    stripped_df = points_df.drop("speed", axis=1).drop("distance", axis=1)
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    # Materialize the pairs so they can be iterated over twice; a bare zip
    # object would be exhausted after the first comprehension in Python 3
    zipped_points_list = list(zip(point_list, point_list[1:]))

    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    distances.insert(0, 0)
    with_speeds_df = pd.concat(
        [stripped_df, pd.Series(distances, index=points_df.index, name="distance")],
        axis=1)

    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    speeds.insert(0, 0)
    with_speeds_df = pd.concat(
        [with_speeds_df, pd.Series(speeds, index=points_df.index, name="speed")],
        axis=1)

    return with_speeds_df
def has_trip_ended(self, prev_point, curr_point, last10PointsDistances,
                   last5MinsDistances, last5MinTimes):
    # Another mismatch between phone and server. Phone stops tracking too soon,
    # so the distance is still greater than the threshold at the end of the trip.
    # But then the next point is a long time away, so we can split again (similar to a distance filter)
    if prev_point is None:
        logging.debug("prev_point is None, continuing trip")
    else:
        timeDelta = curr_point.ts - prev_point.ts
        distDelta = pf.calDistance(prev_point, curr_point)
        if timeDelta > 0:
            speedDelta = distDelta / timeDelta
        else:
            speedDelta = np.nan
        speedThreshold = float(self.distance_threshold) / self.time_threshold

        if (timeDelta > 2 * self.time_threshold and  # We have been here for a while
                speedDelta < speedThreshold):  # we haven't moved very much
            logging.debug("prev_point.ts = %s, curr_point.ts = %s, threshold = %s, large gap = %s, ending trip" %
                          (prev_point.ts, curr_point.ts, self.time_threshold,
                           curr_point.ts - prev_point.ts))
            return True
        else:
            logging.debug("prev_point.ts = %s, curr_point.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                          (prev_point.ts, curr_point.ts,
                           timeDelta, self.time_threshold,
                           distDelta, self.distance_threshold,
                           speedDelta, speedThreshold))

    # The -30 is a fuzz factor intended to compensate for older clients
    # where data collection stopped after 5 mins, so that we never actually
    # see 5 mins of data
    if (len(last10PointsDistances) < self.point_threshold - 1 or
            len(last5MinsDistances) == 0 or
            last5MinTimes.max() < self.time_threshold - 30):
        logging.debug("Too few points to make a decision, continuing")
        return False

    # Normal end-of-trip case
    logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                  (last5MinsDistances.max(), last10PointsDistances.max()))
    if (last5MinsDistances.max() < self.distance_threshold and
            last10PointsDistances.max() < self.distance_threshold):
        return True
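# A worked example of the "large gap, barely moved" split above, with assumed
# configuration values (100 m / 300 s are illustrative, not necessarily the
# deployed thresholds).
distance_threshold = 100.0   # meters (assumed)
time_threshold = 300.0       # seconds (assumed)

speed_threshold = distance_threshold / time_threshold    # ~0.33 m/s

time_delta = 700.0           # seconds since the previous point
dist_delta = 40.0            # meters moved in that time
speed_delta = dist_delta / time_delta                     # ~0.057 m/s

# the gap is more than twice the time filter AND the implied speed is well
# below the threshold, so this pair of points would end the trip
print(time_delta > 2 * time_threshold and speed_delta < speed_threshold)  # True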
def add_dist_heading_speed(points_df):
    """
    Returns a new dataframe with an added "speed" column.
    The speed column has the speed between each point and its previous point.
    The first row has a speed of zero.
    """
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    # Materialize the pairs so they can be reused for distances, speeds and
    # headings; a bare zip object would be exhausted after the first use in Python 3
    zipped_points_list = list(zip(point_list, point_list[1:]))

    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    distances.insert(0, 0)
    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    speeds.insert(0, 0)
    headings = [pf.calHeading(p1, p2) for (p1, p2) in zipped_points_list]
    headings.insert(0, 0)

    with_distances_df = pd.concat(
        [points_df, pd.Series(distances, name="distance")], axis=1)
    with_speeds_df = pd.concat(
        [with_distances_df, pd.Series(speeds, name="speed")], axis=1)
    with_headings_df = pd.concat(
        [with_speeds_df, pd.Series(headings, name="heading")], axis=1)
    return with_headings_df
def has_trip_ended(self, lastPoint, currPoint, timeseries):
    # So we must not have been moving for the last _time filter_
    # points. So the trip must have ended
    # Since this is a distance filter, we detect that the last
    # trip has ended at the time that the new trip starts. So
    # if the last_trip_end_point is lastPoint, then
    # curr_trip_start_point should be currPoint. But then we will
    # have problems with the spurious, noisy points that are
    # generated until the geofence is turned on, if ever
    # So we will continue to defer new trip starting until we
    # have worked through all of those.
    timeDelta = currPoint.ts - lastPoint.ts
    distDelta = pf.calDistance(lastPoint, currPoint)
    logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                  (lastPoint, timeDelta, distDelta))
    if timeDelta > self.time_threshold:
        # We have been at this location for more than the time filter.
        # This could be because we have not been moving for the last
        # _time filter_ points, or because we didn't get points for
        # that duration, (e.g. because we were underground)
        if timeDelta > 0:
            speedDelta = distDelta / timeDelta
        else:
            speedDelta = np.nan
        # this is way too slow. On ios, we use 5meters in 10 minutes.
        # On android, we use 10 meters in 5 mins, which seems to work better
        # for this kind of test
        speedThreshold = float(self.distance_threshold * 2) / (self.time_threshold / 2)

        if eaisr.is_tracking_restarted_in_range(lastPoint.ts, currPoint.ts, timeseries):
            logging.debug("tracking was restarted, ending trip")
            return True

        # In general, we get multiple locations between each motion activity. If we see a bunch of motion activities
        # between two location points, and there is a large gap between the last location and the first
        # motion activity as well, let us just assume that there was a restart
        ongoing_motion_check = len(eaisr.get_ongoing_motion_in_range(
            lastPoint.ts, currPoint.ts, timeseries)) > 0
        if timeDelta > self.time_threshold and not ongoing_motion_check:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip" %
                          (lastPoint.ts, currPoint.ts, self.time_threshold,
                           currPoint.ts - lastPoint.ts, ongoing_motion_check))
            return True

        # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
        # Longest flight is 17 hours, which is the longest you can go without cell reception
        # And even if you split an air flight that long into two, you will get some untracked time in the
        # middle, so that's good.
        TWELVE_HOURS = 12 * 60 * 60
        if timeDelta > TWELVE_HOURS:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip" %
                          (lastPoint.ts, currPoint.ts, TWELVE_HOURS,
                           currPoint.ts - lastPoint.ts))
            return True

        if (timeDelta > self.time_threshold and  # We have been here for a while
                speedDelta < speedThreshold):  # we haven't moved very much
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ending trip" %
                          (lastPoint.ts, currPoint.ts, self.time_threshold,
                           currPoint.ts - lastPoint.ts))
            return True
        else:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                          (lastPoint.ts, currPoint.ts,
                           timeDelta, self.time_threshold,
                           distDelta, self.distance_threshold,
                           speedDelta, speedThreshold))
    return False
def has_trip_ended(self, lastPoint, currPoint, timeseries):
    # So we must not have been moving for the last _time filter_
    # points. So the trip must have ended
    # Since this is a distance filter, we detect that the last
    # trip has ended at the time that the new trip starts. So
    # if the last_trip_end_point is lastPoint, then
    # curr_trip_start_point should be currPoint. But then we will
    # have problems with the spurious, noisy points that are
    # generated until the geofence is turned on, if ever
    # So we will continue to defer new trip starting until we
    # have worked through all of those.
    timeDelta = currPoint.ts - lastPoint.ts
    distDelta = pf.calDistance(lastPoint, currPoint)
    logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                  (lastPoint, timeDelta, distDelta))
    if timeDelta > self.time_threshold:
        # We have been at this location for more than the time filter.
        # This could be because we have not been moving for the last
        # _time filter_ points, or because we didn't get points for
        # that duration, (e.g. because we were underground)
        if timeDelta > 0:
            speedDelta = old_div(distDelta, timeDelta)
        else:
            speedDelta = np.nan
        # this is way too slow. On ios, we use 5meters in 10 minutes.
        # On android, we use 10 meters in 5 mins, which seems to work better
        # for this kind of test
        speedThreshold = old_div(float(self.distance_threshold * 2),
                                 (old_div(self.time_threshold, 2)))

        if eaisr.is_tracking_restarted_in_range(lastPoint.ts, currPoint.ts, timeseries):
            logging.debug("tracking was restarted, ending trip")
            return True

        # In general, we get multiple locations between each motion activity. If we see a bunch of motion activities
        # between two location points, and there is a large gap between the last location and the first
        # motion activity as well, let us just assume that there was a restart
        ongoing_motion_in_range = eaisr.get_ongoing_motion_in_range(
            lastPoint.ts, currPoint.ts, timeseries)
        ongoing_motion_check = len(ongoing_motion_in_range) > 0
        if timeDelta > self.time_threshold and not ongoing_motion_check:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip" %
                          (lastPoint.ts, currPoint.ts, self.time_threshold,
                           currPoint.ts - lastPoint.ts, ongoing_motion_check))
            return True

        # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
        # Longest flight is 17 hours, which is the longest you can go without cell reception
        # And even if you split an air flight that long into two, you will get some untracked time in the
        # middle, so that's good.
        TWELVE_HOURS = 12 * 60 * 60
        if timeDelta > TWELVE_HOURS:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip" %
                          (lastPoint.ts, currPoint.ts, TWELVE_HOURS,
                           currPoint.ts - lastPoint.ts))
            return True

        if (timeDelta > self.time_threshold and  # We have been here for a while
                speedDelta < speedThreshold):  # we haven't moved very much
            # This can happen even during ongoing trips due to spurious points
            # generated on some iOS phones
            # https://github.com/e-mission/e-mission-server/issues/577#issuecomment-376379460
            if eaistc.is_huge_invalid_ts_offset(self, lastPoint, currPoint,
                                                timeseries, ongoing_motion_in_range):
                # invalidate from memory and the database.
                logging.debug("About to set valid column for index = %s" %
                              (currPoint.idx))
                self.filtered_points_df.valid.iloc[currPoint.idx] = False
                logging.debug("After dropping %d, filtered points = %s" %
                              (currPoint.idx,
                               self.filtered_points_df.iloc[
                                   currPoint.idx - 5:currPoint.idx + 5][["valid", "fmt_time"]]))
                logging.debug("remove huge invalid ts offset point = %s" % currPoint)
                timeseries.invalidate_raw_entry(currPoint["_id"])
                # We currently re-retrieve the last point every time, so
                # searching upwards is good enough but if we use
                # lastPoint = currPoint, we should update currPoint here
                return False
            else:
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ending trip" %
                              (lastPoint.ts, currPoint.ts, self.time_threshold,
                               currPoint.ts - lastPoint.ts))
                return True
        else:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                          (lastPoint.ts, currPoint.ts,
                           timeDelta, self.time_threshold,
                           distDelta, self.distance_threshold,
                           speedDelta, speedThreshold))
    return False
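# A worked example of the relaxed speed threshold above, assuming the
# Android-style distance-filter configuration mentioned in the comment
# (10 m distance filter, 5 min time threshold); the deployed values may differ.
distance_threshold = 10.0     # meters (assumed)
time_threshold = 300.0        # seconds (assumed)

# the naive threshold would be 10 m / 300 s ~= 0.033 m/s, which the comment
# calls "way too slow"; doubling the distance and halving the time gives 4x that
speed_threshold = float(distance_threshold * 2) / (time_threshold / 2)
print(speed_threshold)        # ~0.133 m/s

# a 600 s gap in which we only moved 30 m implies 0.05 m/s, below the
# threshold, so the trip would be ended at this point
time_delta, dist_delta = 600.0, 30.0
print(time_delta > time_threshold and (dist_delta / time_delta) < speed_threshold)  # True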
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_pre_ts_diff_df = timeseries.get_data_df(
        "background/filtered_location", time_query)
    # Sometimes, we can get bogus points because data.ts and
    # metadata.write_ts are off by a lot. If we don't do this, we end up
    # appearing to travel back in time
    # https://github.com/e-mission/e-mission-server/issues/457
    filtered_points_df = filtered_points_pre_ts_diff_df[
        (filtered_points_pre_ts_diff_df.metadata_write_ts -
         filtered_points_pre_ts_diff_df.ts) < 1000]
    filtered_points_df.reset_index(inplace=True)
    transition_df = timeseries.get_data_df("statemachine/transition", time_query)
    if len(transition_df) > 0:
        logging.debug("transition_df = %s" %
                      transition_df[["fmt_time", "transition"]])
    else:
        logging.debug("no transitions found. This can happen for continuous sensing")

    self.last_ts_processed = None
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    prevPoint = None
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            if self.continue_just_ended(idx, currPoint, filtered_points_df):
                # We have "processed" the currPoint by deciding to glom it
                self.last_ts_processed = currPoint.metadata_write_ts
                continue
            # else:
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False

        last5MinsPoints_df = filtered_points_df[np.logical_and(
            np.logical_and(
                filtered_points_df.ts > currPoint.ts - self.time_threshold,
                filtered_points_df.ts < currPoint.ts),
            filtered_points_df.ts >= curr_trip_start_point.ts)]
        # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
        # Using .iloc just ends up including points after this one.
        # So we reset_index upstream and use it here.
        # We are going to use the last 8 points for now.
        # TODO: Change this back to last 10 points once we normalize phone and this
        last10Points_df = filtered_points_df.iloc[
            max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
        distanceToLast = lambda row: pf.calDistance(ad.AttrDict(row), currPoint)
        timeToLast = lambda row: currPoint.ts - ad.AttrDict(row).ts
        last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
        logging.debug("last5MinsDistances = %s with length %d" %
                      (last5MinsDistances.as_matrix(), len(last5MinsDistances)))
        last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
        logging.debug("last10PointsDistances = %s with length %d, shape %s" %
                      (last10PointsDistances.as_matrix(),
                       len(last10PointsDistances),
                       last10PointsDistances.shape))

        # Fix for https://github.com/e-mission/e-mission-server/issues/348
        last5MinTimes = last5MinsPoints_df.apply(timeToLast, axis=1)

        logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                      (len(last10PointsDistances), len(last5MinsDistances)))
        logging.debug("last5MinsTimes.max() = %s, time_threshold = %s" %
                      (last5MinTimes.max() if len(last5MinTimes) > 0 else np.NaN,
                       self.time_threshold))

        if self.has_trip_ended(prevPoint, currPoint, timeseries,
                               last10PointsDistances, last5MinsDistances, last5MinTimes):
            (ended_before_this, last_trip_end_point) = self.get_last_trip_end_point(
                filtered_points_df, last10Points_df, last5MinsPoints_df)
            segmentation_points.append((curr_trip_start_point, last_trip_end_point))
            logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
            # We have processed everything up to the trip end by marking it as a completed trip
            self.last_ts_processed = currPoint.metadata_write_ts
            if ended_before_this:
                # in this case, we end a trip at the previous point, and the next trip starts at this
                # point, not the next one
                just_ended = False
                prevPoint = currPoint
                curr_trip_start_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" %
                              (currPoint, currPoint.idx))
            else:
                # We end a trip at the current point, and the next trip starts at the next point
                just_ended = True
                prevPoint = None
        else:
            prevPoint = currPoint

    logging.debug("Iterated over all points, just_ended = %s, len(transition_df) = %s" %
                  (just_ended, len(transition_df)))
    if not just_ended and len(transition_df) > 0:
        stopped_moving_after_last = transition_df[
            (transition_df.ts > currPoint.ts) & (transition_df.transition == 2)]
        logging.debug("looking after %s, found transitions %s" %
                      (currPoint.ts, stopped_moving_after_last))
        if len(stopped_moving_after_last) > 0:
            (unused, last_trip_end_point) = self.get_last_trip_end_point(
                filtered_points_df, last10Points_df, None)
            segmentation_points.append((curr_trip_start_point, last_trip_end_point))
            logging.debug("Found trip end at %s" % last_trip_end_point.fmt_time)
            # We have processed everything up to the trip end by marking it as a completed trip
            self.last_ts_processed = currPoint.metadata_write_ts

    return segmentation_points
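# A small sketch of the "last 5 minutes" window selection above, on a toy
# dataframe. The timestamps and the 300 s threshold are made up for
# illustration; the real code applies the same mask to filtered_points_df.
import numpy as np
import pandas as pd

filtered_points_df = pd.DataFrame({"ts": [0, 100, 250, 400, 520, 600]})
time_threshold = 300
curr_ts = 600           # timestamp of currPoint
trip_start_ts = 250     # timestamp of curr_trip_start_point

last5MinsPoints_df = filtered_points_df[np.logical_and(
    np.logical_and(
        filtered_points_df.ts > curr_ts - time_threshold,   # within the window
        filtered_points_df.ts < curr_ts),                   # strictly before currPoint
    filtered_points_df.ts >= trip_start_ts)]                # not before the trip start

print(last5MinsPoints_df.ts.tolist())   # [400, 520]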
def has_trip_ended(self, prev_point, curr_point, timeseries,
                   last10PointsDistances, last5MinsDistances, last5MinTimes):
    # Another mismatch between phone and server. Phone stops tracking too soon,
    # so the distance is still greater than the threshold at the end of the trip.
    # But then the next point is a long time away, so we can split again (similar to a distance filter)
    if prev_point is None:
        logging.debug("prev_point is None, continuing trip")
    else:
        timeDelta = curr_point.ts - prev_point.ts
        distDelta = pf.calDistance(prev_point, curr_point)
        if timeDelta > 0:
            speedDelta = old_div(distDelta, timeDelta)
        else:
            speedDelta = np.nan
        speedThreshold = old_div(float(self.distance_threshold), self.time_threshold)

        if eaisr.is_tracking_restarted_in_range(prev_point.ts, curr_point.ts, timeseries):
            logging.debug("tracking was restarted, ending trip")
            return True

        ongoing_motion_check = len(eaisr.get_ongoing_motion_in_range(
            prev_point.ts, curr_point.ts, timeseries)) > 0
        if timeDelta > 2 * self.time_threshold and not ongoing_motion_check:
            logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip" %
                          (prev_point.ts, curr_point.ts, self.time_threshold,
                           curr_point.ts - prev_point.ts, ongoing_motion_check))
            return True

        # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
        # Longest flight is 17 hours, which is the longest you can go without cell reception
        # And even if you split an air flight that long into two, you will get some untracked time in the
        # middle, so that's good.
        TWELVE_HOURS = 12 * 60 * 60
        if timeDelta > TWELVE_HOURS:
            logging.debug("prev_point.ts = %s, curr_point.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip" %
                          (prev_point.ts, curr_point.ts, TWELVE_HOURS,
                           curr_point.ts - prev_point.ts))
            return True

        if (timeDelta > 2 * self.time_threshold and  # We have been here for a while
                speedDelta < speedThreshold):  # we haven't moved very much
            logging.debug("prev_point.ts = %s, curr_point.ts = %s, threshold = %s, large gap = %s, ending trip" %
                          (prev_point.ts, curr_point.ts, self.time_threshold,
                           curr_point.ts - prev_point.ts))
            return True
        else:
            logging.debug("prev_point.ts = %s, curr_point.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                          (prev_point.ts, curr_point.ts,
                           timeDelta, self.time_threshold,
                           distDelta, self.distance_threshold,
                           speedDelta, speedThreshold))

    # The -30 is a fuzz factor intended to compensate for older clients
    # where data collection stopped after 5 mins, so that we never actually
    # see 5 mins of data
    if (len(last10PointsDistances) < self.point_threshold - 1 or
            len(last5MinsDistances) == 0 or
            last5MinTimes.max() < self.time_threshold - 30):
        logging.debug("Too few points to make a decision, continuing")
        return False

    # Normal end-of-trip case
    logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                  (last5MinsDistances.max(), last10PointsDistances.max()))
    if (last5MinsDistances.max() < self.distance_threshold and
            last10PointsDistances.max() < self.distance_threshold):
        return True
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
    transition_df = timeseries.get_data_df("statemachine/transition", time_query)
    if len(filtered_points_df) == 0:
        self.last_ts_processed = None
    else:
        # TODO: Decide whether we should return the write_ts in the entry,
        # or whether we should search by timestamp instead.
        # Depends on final direction for the timequery
        self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" %
                          (lastPoint,
                           pf.calDistance(lastPoint, currPoint) < self.distance_threshold,
                           currPoint.ts - lastPoint.ts <= self.time_threshold))
            # Unlike the time filter, with the distance filter, we concatenate all points
            # that are within the distance threshold with the previous trip
            # end, since because of the distance filter, even noisy points
            # can occur at an arbitrary time in the future
            if pf.calDistance(lastPoint, currPoint) < self.distance_threshold:
                logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                             (lastPoint, currPoint))
                continue
            # else:
            # Here's where we deal with the start trip. At this point, the
            # distance is greater than the filter.
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False
        else:
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            last10Points_df = filtered_points_df.iloc[
                max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
            lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                          (lastPoint, currPoint.ts - lastPoint.ts,
                           pf.calDistance(lastPoint, currPoint)))
            if currPoint.ts - lastPoint.ts > self.time_threshold:
                # We have been at this location for more than the time filter.
                # So we must not have been moving for the last _time filter_
                # points. So the trip must have ended
                # Since this is a distance filter, we detect that the last
                # trip has ended at the time that the new trip starts. So
                # if the last_trip_end_point is lastPoint, then
                # curr_trip_start_point should be currPoint. But then we will
                # have problems with the spurious, noisy points that are
                # generated until the geofence is turned on, if ever
                # So we will continue to defer new trip starting until we
                # have worked through all of those.
                last_trip_end_point = lastPoint
                logging.debug("Appending last_trip_end_point %s with index %s " %
                              (last_trip_end_point, idx - 1))
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                just_ended = True

    # Since we only end a trip when we start a new trip, this means that
    # the last trip that was pushed is ignored. Consider the example of
    # 2016-02-22 when I took kids to karate. We arrived shortly after 4pm,
    # so during that remote push, a trip end was not detected. And we got
    # back home shortly after 5pm, so the trip end was only detected on the
    # phone at 6pm. At that time, the following points were pushed:
    # ..., 2016-02-22T16:04:02, 2016-02-22T16:49:34, 2016-02-22T16:49:50,
    # ..., 2016-02-22T16:57:04
    # Then, on the server, while iterating through the points, we detected
    # a trip end at 16:04, and a new trip start at 16:49. But we did not
    # detect the trip end at 16:57, because we didn't start a new point.
    # This has two issues:
    # - we won't see this trip until the next trip start, which may be on the next day
    # - we won't see this trip at all, because when we run the pipeline the
    #   next time, we will only look at points from that time onwards. These
    #   points have been marked as "processed", so they won't even be considered.
    # There are multiple potential fixes:
    # - we can mark only the completed trips as processed. This will solve (2) above, but not (1)
    # - we can mark a trip end based on the fact that we only push data
    #   when a trip ends, so if we have data, it means that the trip has been
    #   detected as ended on the phone.
    # This seems a bit fragile - what if we start pushing incomplete trip
    # data for efficiency reasons? Therefore, we also check to see if there
    # is a trip_end_detected in this timeframe after the last point. If so,
    # then we end the trip at the last point that we have.
    if not just_ended and len(transition_df) > 0:
        stopped_moving_after_last = transition_df[
            (transition_df.ts > currPoint.ts) & (transition_df.transition == 2)]
        if len(stopped_moving_after_last) > 0:
            segmentation_points.append((curr_trip_start_point, currPoint))

    return segmentation_points
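# A small sketch of the transition check above: after the loop, if the trip has
# not ended, we look for a "stopped moving" transition (transition == 2) recorded
# after the last processed point. The toy dataframe and timestamps are assumptions
# for illustration; the real transition_df comes from the statemachine/transition
# stream.
import pandas as pd

transition_df = pd.DataFrame({
    "ts":         [900, 1100, 1250],
    "transition": [1,   2,    1],
})
curr_point_ts = 1000   # ts of the last location point we saw

stopped_moving_after_last = transition_df[
    (transition_df.ts > curr_point_ts) & (transition_df.transition == 2)]
print(len(stopped_moving_after_last) > 0)   # True: end the trip at the last point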
def end_points_distance(segment):
    if segment.start == segment.end:
        raise RuntimeError("This is a messed up segment. Investigate further")
    return pf.calDistance(segment.segment_df.iloc[0], segment.segment_df.iloc[-1])
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
    if len(filtered_points_df) == 0:
        self.last_ts_processed = None
    else:
        # TODO: Decide whether we should return the write_ts in the entry,
        # or whether we should search by timestamp instead.
        # Depends on final direction for the timequery
        self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" %
                          (lastPoint,
                           pf.calDistance(lastPoint, currPoint) < self.distance_threshold,
                           currPoint.ts - lastPoint.ts <= self.time_threshold))
            # Unlike the time filter, with the distance filter, we concatenate all points
            # that are within the distance threshold with the previous trip
            # end, since because of the distance filter, even noisy points
            # can occur at an arbitrary time in the future
            if pf.calDistance(lastPoint, currPoint) < self.distance_threshold:
                logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                             (lastPoint, currPoint))
                continue
            # else:
            # Here's where we deal with the start trip. At this point, the
            # distance is greater than the filter.
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False
        else:
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            last10Points_df = filtered_points_df.iloc[
                max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
            lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                          (lastPoint, currPoint.ts - lastPoint.ts,
                           pf.calDistance(lastPoint, currPoint)))
            if currPoint.ts - lastPoint.ts > self.time_threshold:
                # We have been at this location for more than the time filter.
                # So we must not have been moving for the last _time filter_
                # points. So the trip must have ended
                # Since this is a distance filter, we detect that the last
                # trip has ended at the time that the new trip starts. So
                # if the last_trip_end_point is lastPoint, then
                # curr_trip_start_point should be currPoint. But then we will
                # have problems with the spurious, noisy points that are
                # generated until the geofence is turned on, if ever
                # So we will continue to defer new trip starting until we
                # have worked through all of those.
                last_trip_end_point = lastPoint
                logging.debug("Appending last_trip_end_point %s with index %s " %
                              (last_trip_end_point, idx - 1))
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                just_ended = True

    return segmentation_points
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
    if len(filtered_points_df) == 0:
        self.last_ts_processed = None
    else:
        # TODO: Decide whether we should return the write_ts in the entry,
        # or whether we should search by timestamp instead.
        # Depends on final direction for the timequery
        self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            # Normally, at this point, since the logic here and the
            # logic on the phone are the same, if we have detected a trip
            # end, any points after this are part of the new trip.
            #
            # However, in some circumstances, notably in my data from 27th
            # August, there appears to be a mismatch and we get a couple of
            # points past the end that we detected here. So let's look for
            # points that are within the distance filter, and are at a
            # delta of a minute, and ignore them instead of using them to
            # start the new trip
            prev_point = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("Comparing with prev_point = %s" % prev_point)
            if pf.calDistance(prev_point, currPoint) < self.distance_threshold and \
                    currPoint.ts - prev_point.ts <= 60:
                logging.info("Points %s and %s are within the distance filter and only 1 min apart so part of the same trip" %
                             (prev_point, currPoint))
                continue
            # else:
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False

        last5MinsPoints_df = filtered_points_df[np.logical_and(
            np.logical_and(
                filtered_points_df.ts > currPoint.ts - self.time_threshold,
                filtered_points_df.ts < currPoint.ts),
            filtered_points_df.ts >= curr_trip_start_point.ts)]
        # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
        # Using .iloc just ends up including points after this one.
        # So we reset_index upstream and use it here.
        # We are going to use the last 8 points for now.
        # TODO: Change this back to last 10 points once we normalize phone and this
        last10Points_df = filtered_points_df.iloc[
            max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
        distanceToLast = lambda row: pf.calDistance(ad.AttrDict(row), currPoint)
        last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
        logging.debug("last5MinsDistances = %s with length %d" %
                      (last5MinsDistances.as_matrix(), len(last5MinsDistances)))
        last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
        logging.debug("last10PointsDistances = %s with length %d, shape %s" %
                      (last10PointsDistances.as_matrix(),
                       len(last10PointsDistances),
                       last10PointsDistances.shape))
        logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                      (len(last10PointsDistances), len(last5MinsDistances)))

        if (len(last10PointsDistances) < self.point_threshold - 1 or
                len(last5MinsDistances) == 0):
            logging.debug("Too few points to make a decision, continuing")
        else:
            logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                          (last5MinsDistances.max(), last10PointsDistances.max()))
            if (last5MinsDistances.max() < self.distance_threshold and
                    last10PointsDistances.max() < self.distance_threshold):
                last_trip_end_index = int(min(np.median(last5MinsPoints_df.index),
                                              np.median(last10Points_df.index)))
                # logging.debug("last5MinPoints.median = %s (%s), last10Points_df = %s (%s), sel index = %s" %
                #               (np.median(last5MinsPoints_df.index), last5MinsPoints_df.index,
                #                np.median(last10Points_df.index), last10Points_df.index,
                #                last_trip_end_index))
                last_trip_end_point_row = filtered_points_df.iloc[last_trip_end_index]
                last_trip_end_point = ad.AttrDict(filtered_points_df.iloc[last_trip_end_index])
                logging.debug("Appending last_trip_end_point %s with index %s " %
                              (last_trip_end_point, last_trip_end_point_row.name))
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                just_ended = True

    return segmentation_points
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_pre_ts_diff_df = timeseries.get_data_df(
        "background/filtered_location", time_query)
    # Sometimes, we can get bogus points because data.ts and
    # metadata.write_ts are off by a lot. If we don't do this, we end up
    # appearing to travel back in time
    # https://github.com/e-mission/e-mission-server/issues/457
    filtered_points_df = filtered_points_pre_ts_diff_df[(
        filtered_points_pre_ts_diff_df.metadata_write_ts -
        filtered_points_pre_ts_diff_df.ts) < 1000]
    filtered_points_df.reset_index(inplace=True)

    transition_df = timeseries.get_data_df("statemachine/transition", time_query)
    if len(transition_df) > 0:
        logging.debug("transition_df = %s" %
                      transition_df[["fmt_time", "transition"]])
    else:
        logging.debug("no transitions found. This can happen for continuous sensing")

    self.last_ts_processed = None
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    prevPoint = None
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            if self.continue_just_ended(idx, currPoint, filtered_points_df):
                # We have "processed" the currPoint by deciding to glom it
                self.last_ts_processed = currPoint.metadata_write_ts
                continue
            # else:
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False

        last5MinsPoints_df = filtered_points_df[np.logical_and(
            np.logical_and(
                filtered_points_df.ts > currPoint.ts - self.time_threshold,
                filtered_points_df.ts < currPoint.ts),
            filtered_points_df.ts >= curr_trip_start_point.ts)]
        # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
        # Using .iloc just ends up including points after this one.
        # So we reset_index upstream and use it here.
        # We are going to use the last 8 points for now.
        # TODO: Change this back to last 10 points once we normalize phone and this
        last10Points_df = filtered_points_df.iloc[
            max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
        distanceToLast = lambda row: pf.calDistance(ad.AttrDict(row), currPoint)
        timeToLast = lambda row: currPoint.ts - ad.AttrDict(row).ts
        last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
        logging.debug("last5MinsDistances = %s with length %d" %
                      (last5MinsDistances.to_numpy(), len(last5MinsDistances)))
        last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
        logging.debug("last10PointsDistances = %s with length %d, shape %s" %
                      (last10PointsDistances.to_numpy(),
                       len(last10PointsDistances),
                       last10PointsDistances.shape))

        # Fix for https://github.com/e-mission/e-mission-server/issues/348
        last5MinTimes = last5MinsPoints_df.apply(timeToLast, axis=1)

        logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                      (len(last10PointsDistances), len(last5MinsDistances)))
        logging.debug("last5MinTimes.max() = %s, time_threshold = %s" %
                      (last5MinTimes.max() if len(last5MinTimes) > 0 else np.nan,
                       self.time_threshold))

        if self.has_trip_ended(prevPoint, currPoint, timeseries,
                               last10PointsDistances, last5MinsDistances, last5MinTimes):
            (ended_before_this, last_trip_end_point) = self.get_last_trip_end_point(
                filtered_points_df, last10Points_df, last5MinsPoints_df)
            segmentation_points.append((curr_trip_start_point, last_trip_end_point))
            logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
            # We have processed everything up to the trip end by marking it as a completed trip
            self.last_ts_processed = currPoint.metadata_write_ts
            if ended_before_this:
                # in this case, we end a trip at the previous point, and the next trip starts at this
                # point, not the next one
                just_ended = False
                prevPoint = currPoint
                curr_trip_start_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" %
                              (currPoint, currPoint.idx))
            else:
                # We end a trip at the current point, and the next trip starts at the next point
                just_ended = True
                prevPoint = None
        else:
            prevPoint = currPoint

    logging.debug("Iterated over all points, just_ended = %s, len(transition_df) = %s" %
                  (just_ended, len(transition_df)))
    if not just_ended and len(transition_df) > 0:
        stopped_moving_after_last = transition_df[
            (transition_df.ts > currPoint.ts) & (transition_df.transition == 2)]
        logging.debug("looking after %s, found transitions %s" %
                      (currPoint.ts, stopped_moving_after_last))
        if len(stopped_moving_after_last) > 0:
            (unused, last_trip_end_point) = self.get_last_trip_end_point(
                filtered_points_df, last10Points_df, None)
            segmentation_points.append((curr_trip_start_point, last_trip_end_point))
            logging.debug("Found trip end at %s" % last_trip_end_point.fmt_time)
            # We have processed everything up to the trip end by marking it as a completed trip
            self.last_ts_processed = currPoint.metadata_write_ts

    return segmentation_points
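# A minimal sketch of how the (trip start, trip end) pairs returned by
# segment_into_trips might be consumed. Only the tuple structure and the
# ts / fmt_time fields are taken from the code above; the print_segments
# helper is illustrative, and the real pipeline presumably hands these
# points to downstream trip-building code instead of printing them.

def print_segments(segmentation_points):
    # each entry is a (trip_start_point, trip_end_point) pair of AttrDicts
    # carrying at least ts and fmt_time
    for (start, end) in segmentation_points:
        duration = end.ts - start.ts
        print("trip from %s to %s lasting %.0f secs" %
              (start.fmt_time, end.fmt_time, duration))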
def segment_into_trips(self, timeseries, time_query):
    """
    Examines the timeseries database for a specific range and returns the
    segmentation points. Note that the input is the entire timeseries and
    the time range. This allows algorithms to use whatever combination of
    data that they want from the sensor streams in order to determine the
    segmentation points.
    """
    filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
    if len(filtered_points_df) == 0:
        self.last_ts_processed = None
    else:
        # TODO: Decide whether we should return the write_ts in the entry,
        # or whether we should search by timestamp instead.
        # Depends on final direction for the timequery
        self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts
    logging.info("Last ts processed = %s" % self.last_ts_processed)

    segmentation_points = []
    last_trip_end_point = None
    curr_trip_start_point = None
    just_ended = True
    for idx, row in filtered_points_df.iterrows():
        currPoint = ad.AttrDict(row)
        currPoint.update({"idx": idx})
        logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
        if curr_trip_start_point is None:
            logging.debug("Appending currPoint because the current start point is None")
            # segmentation_points.append(currPoint)

        if just_ended:
            # Normally, at this point, since the logic here and the
            # logic on the phone are the same, if we have detected a trip
            # end, any points after this are part of the new trip.
            #
            # However, in some circumstances, notably in my data from 27th
            # August, there appears to be a mismatch and we get a couple of
            # points past the end that we detected here. So let's look for
            # points that are within the distance filter, and are at a
            # delta of a minute, and ignore them instead of using them to
            # start the new trip
            # Note: when idx == 0, iloc[idx - 1] wraps around to the last
            # row in the dataframe
            prev_point = ad.AttrDict(filtered_points_df.iloc[idx - 1])
            logging.debug("Comparing with prev_point = %s" % prev_point)
            if pf.calDistance(prev_point, currPoint) < self.distance_threshold and \
                    currPoint.ts - prev_point.ts <= 60:
                logging.info("Points %s and %s are within the distance filter and only 1 min apart so part of the same trip" %
                             (prev_point, currPoint))
                continue
            # else:
            sel_point = currPoint
            logging.debug("Setting new trip start point %s with idx %s" %
                          (sel_point, sel_point.idx))
            curr_trip_start_point = sel_point
            just_ended = False

        last5MinsPoints_df = filtered_points_df[np.logical_and(
            np.logical_and(
                filtered_points_df.ts > currPoint.ts - self.time_threshold,
                filtered_points_df.ts < currPoint.ts),
            filtered_points_df.ts >= curr_trip_start_point.ts)]
        # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
        # Using .iloc just ends up including points after this one.
        # So we reset_index upstream and use it here.
        # We are going to use the last 8 points for now.
        # TODO: Change this back to last 10 points once we normalize phone and this
        last10Points_df = filtered_points_df.iloc[
            max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]
        distanceToLast = lambda row: pf.calDistance(ad.AttrDict(row), currPoint)
        last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
        logging.debug("last5MinsDistances = %s with length %d" %
                      (last5MinsDistances.to_numpy(), len(last5MinsDistances)))
        last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
        logging.debug("last10PointsDistances = %s with length %d, shape %s" %
                      (last10PointsDistances.to_numpy(),
                       len(last10PointsDistances),
                       last10PointsDistances.shape))
        logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                      (len(last10PointsDistances), len(last5MinsDistances)))

        if (len(last10PointsDistances) < self.point_threshold - 1 or
                len(last5MinsDistances) == 0):
            logging.debug("Too few points to make a decision, continuing")
        else:
            logging.debug("last5MinsDistances.max() = %s, last10PointsDistances.max() = %s" %
                          (last5MinsDistances.max(), last10PointsDistances.max()))
            if (last5MinsDistances.max() < self.distance_threshold and
                    last10PointsDistances.max() < self.distance_threshold):
                last_trip_end_index = int(min(np.median(last5MinsPoints_df.index),
                                              np.median(last10Points_df.index)))
                # logging.debug("last5MinPoints.median = %s (%s), last10Points_df = %s (%s), sel index = %s" %
                #               (np.median(last5MinsPoints_df.index), last5MinsPoints_df.index,
                #                np.median(last10Points_df.index), last10Points_df.index,
                #                last_trip_end_index))
                last_trip_end_point_row = filtered_points_df.iloc[last_trip_end_index]
                last_trip_end_point = ad.AttrDict(filtered_points_df.iloc[last_trip_end_index])
                logging.debug("Appending last_trip_end_point %s with index %s" %
                              (last_trip_end_point, last_trip_end_point_row.name))
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                just_ended = True

    return segmentation_points
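# In the older variant above, the trip end is picked as the smaller of the two
# median indices of the "last 5 minutes" window and the "last N points" window.
# A minimal sketch of that selection with made-up index values (the index lists
# below are assumptions chosen purely for illustration):

import numpy as np

last5_mins_index = [12, 13, 14, 15, 16]               # hypothetical row indices
last10_points_index = [9, 10, 11, 12, 13, 14, 15, 16]
last_trip_end_index = int(min(np.median(last5_mins_index),
                              np.median(last10_points_index)))
# np.median([12..16]) = 14.0, np.median([9..16]) = 12.5, so the end index is 12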