def testGetTimeRangeForTrip(self):
    """The time query built from a raw trip should use the data.ts field and
    span the trip's start/end timestamps (5 and 6 in the fake trip fixture)."""
    fake_trip = self.create_fake_trip()
    trip_tq = esda.get_time_query_for_trip_like(esda.RAW_TRIP_KEY,
                                                fake_trip.get_id())
    self.assertEqual(trip_tq.timeType, "data.ts")
    self.assertEqual(trip_tq.startTs, 5)
    self.assertEqual(trip_tq.endTs, 6)
def testPointFilteringRichmondJump(self):
    """The Richmond jump trip has a single section containing exactly one
    outlier point, which zigzag smoothing should flag for deletion."""
    jump_trip = self.trip_entries[6]
    self.loadPointsForTrip(jump_trip.get_id())
    jump_sections = [entry for entry in self.section_entries
                     if entry.data.trip_id == jump_trip.get_id()]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag(False, 100)
    for idx, curr_section in enumerate(jump_sections):
        logging.debug("-" * 20 + "Considering section %s" % idx + "-" * 20)
        loc_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   curr_section.get_id())
        section_df = self.ts.get_data_df("background/filtered_location", loc_tq)
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)
        max_speed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (idx, max_speed))
        jump_algo.filter(with_speeds_df)
        logging.debug("Retaining points %s" % np.nonzero(jump_algo.inlier_mask_))
        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" % np.nonzero(to_delete_mask))
        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)
        # There is only one section
        self.assertEqual(idx, 0)
        # The bad section, should have the third point filtered
        self.assertEqual(np.count_nonzero(to_delete_mask), 1)
        self.assertEqual([str(oid) for oid in delete_ids],
                         ["55e86dbb7d65cb39ee987e09"])
def section_to_geojson(section, tl):
    """
    This is the trickiest part of the visualization.
    The section is basically a collection of points with a line through them.
    So the representation is a feature in which one feature which is the line,
    and one feature collection which is the set of point features.
    :param section: the section to be converted
    :param tl: timeline object, used to look up the parent trip's start place
    :return: a feature collection which is the geojson version of the section
    """
    ts = esta.TimeSeries.get_time_series(section.user_id)
    # All recreated locations that fall inside this cleaned section's time span
    entry_it = ts.find_entries(["analysis/recreated_location"],
                               esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section",
                                   section.get_id()))
    # TODO: Decide whether we want to use Rewrite to use dataframes throughout instead of python arrays.
    # dataframes insert nans. We could use fillna to fill with default values, but if we are not actually
    # using dataframe features here, it is unclear how much that would help.
    feature_array = []
    section_location_entries = [ecwe.Entry(entry) for entry in entry_it]
    if len(section_location_entries) != 0:
        logging.debug("first element in section_location_array = %s" %
                      section_location_entries[0])
        # Fudge the end point so that we don't have a gap because of the ts != write_ts mismatch
        # TODO: Fix this once we are able to query by the data timestamp instead of the metadata ts
        if section_location_entries[-1].data.loc != section.data.end_loc:
            logging.info("section_location_array[-1].data.loc %s != section.data.end_loc %s even after df.ts fix, filling gap" % \
                         (section_location_entries[-1].data.loc,
                          section.data.end_loc))
            # Patch the gap with the raw location closest to the section end ts
            last_loc_doc = ts.get_entry_at_ts("background/filtered_location",
                                              "data.ts", section.data.end_ts)
            if last_loc_doc is None:
                logging.warning("can't find entry to patch gap, leaving gap")
            else:
                last_loc_entry = ecwe.Entry(last_loc_doc)
                logging.debug("Adding new entry %s to fill the end point gap between %s and %s" %
                              (last_loc_entry.data.loc,
                               section_location_entries[-1].data.loc,
                               section.data.end_loc))
                section_location_entries.append(last_loc_entry)
    points_line_feature = point_array_to_line(section_location_entries)
    # If this is the first section, we already start from the trip start. But we actually need to start from the
    # prior place. Fudge this too. Note also that we may want to figure out how to handle this properly in the model
    # without needing fudging. TODO: Unclear how exactly to do this
    if section.data.start_stop is None:
        # This is the first section. So we need to find the start place of the parent trip
        parent_trip = tl.get_object(section.data.trip_id)
        start_place_of_parent_trip = tl.get_object(parent_trip.data.start_place)
        points_line_feature.geometry.coordinates.insert(
            0, start_place_of_parent_trip.data.location.coordinates)
    points_line_feature.id = str(section.get_id())
    points_line_feature.properties = copy.copy(section.data)
    points_line_feature.properties["feature_type"] = "section"
    # Serialize the mode enum as its string form for the geojson payload
    points_line_feature.properties["sensed_mode"] = str(points_line_feature.properties.sensed_mode)
    # start/end locations are not JSON-serializable; drop them from properties
    _del_non_derializable(points_line_feature.properties, ["start_loc", "end_loc"])
    # feature_array.append(gj.FeatureCollection(points_feature_array))
    feature_array.append(points_line_feature)
    return gj.FeatureCollection(feature_array)
def filter_jumps(user_id, section_id):
    """Detect outlier ("jump") points for one section and record them.

    Runs boxplot outlier detection plus zigzag smoothing over the section's
    filtered locations, then stores an "analysis/smoothing" entry listing the
    ids of the points that should be ignored.

    :param user_id: the user id to filter the trips for
    :param section_id: the section_id to filter the trips for
    :return: None; an entry with the filtered points is saved to the database.
    """
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))
    ts = esta.TimeSeries.get_time_series(user_id)
    section_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   section_id)
    section_points_df = ts.get_data_df("background/filtered_location",
                                       section_tq)
    logging.debug("len(section_points_df) = %s" % len(section_points_df))
    outlier_algo = eaico.BoxplotOutlier()
    filtering_algo = eaicj.SmoothZigzag()
    ignored_points_df = get_points_to_filter(section_points_df, outlier_algo,
                                             filtering_algo)
    if ignored_points_df is None:
        # Nothing to delete for this section
        return
    deleted_point_id_list = list(ignored_points_df._id)
    logging.debug("deleted %s points" % len(deleted_point_id_list))
    # Persist only the ids of the deleted points; the raw points stay intact
    filter_result = ecws.Smoothresults()
    filter_result.section = section_id
    filter_result.deleted_points = deleted_point_id_list
    filter_result.outlier_algo = "BoxplotOutlier"
    filter_result.filtering_algo = "SmoothZigzag"
    result_entry = ecwe.Entry.create_entry(user_id, "analysis/smoothing",
                                           filter_result)
    ts.insert(result_entry)
def filter_jumps(user_id, section_id):
    """
    filters out any jumps in the points related to this section and stores
    a entry that lists the deleted points for this trip and this section.
    :param user_id: the user id to filter the trips for
    :param section_id: the section_id to filter the trips for
    :return: none. saves an entry with the filtered points into the database.
    """
    # NOTE(review): this definition looks identical to another filter_jumps in
    # this file; in Python the later definition shadows the earlier one --
    # confirm whether both copies are intended.
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))
    # Boxplot outlier threshold feeds the zigzag smoothing filter
    outlier_algo = eaico.BoxplotOutlier()
    filtering_algo = eaicj.SmoothZigzag()
    tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY, section_id)
    ts = esta.TimeSeries.get_time_series(user_id)
    # All filtered-location points within this section's time range
    section_points_df = ts.get_data_df("background/filtered_location", tq)
    logging.debug("len(section_points_df) = %s" % len(section_points_df))
    points_to_ignore_df = get_points_to_filter(section_points_df, outlier_algo,
                                               filtering_algo)
    if points_to_ignore_df is None:
        # There were no points to delete
        return
    deleted_point_id_list = list(points_to_ignore_df._id)
    logging.debug("deleted %s points" % len(deleted_point_id_list))
    # Persist only the ids of the deleted points; raw points are untouched
    filter_result = ecws.Smoothresults()
    filter_result.section = section_id
    filter_result.deleted_points = deleted_point_id_list
    filter_result.outlier_algo = "BoxplotOutlier"
    filter_result.filtering_algo = "SmoothZigzag"
    result_entry = ecwe.Entry.create_entry(user_id, "analysis/smoothing",
                                           filter_result)
    ts.insert(result_entry)
def testPointFilteringZigzag(self):
    # trip_entries[8] is the zigzag trip fixture (per the test name)
    classicJumpTrip1 = self.trip_entries[8]
    self.loadPointsForTrip(classicJumpTrip1.get_id())
    classicJumpSections1 = [
        s for s in self.section_entries
        if s.data.trip_id == classicJumpTrip1.get_id()
    ]
    outlier_algo = eaics.BoxplotOutlier()
    # SmoothZigzag(False, 100) -- presumably (is_ios, same_point_distance);
    # TODO confirm against the SmoothZigzag constructor
    jump_algo = eaicj.SmoothZigzag(False, 100)
    for i, section_entry in enumerate(classicJumpSections1):
        logging.debug("-" * 20 + "Considering section %s" % i + "-" * 20)
        section_df = self.ts.get_data_df(
            "background/filtered_location",
            esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                              section_entry.get_id()))
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)
        maxSpeed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (i, maxSpeed))
        jump_algo.filter(with_speeds_df)
        # convert the pandas mask to a numpy array before np.nonzero
        logging.debug("Retaining points %s" %
                      np.nonzero(jump_algo.inlier_mask_.to_numpy()))
        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" %
                      np.nonzero(to_delete_mask.to_numpy()))
        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)
        if i == 0:
            # this is the zigzag section
            self.assertEqual(
                np.nonzero(to_delete_mask.to_numpy())[0].tolist(),
                [25, 64, 114, 115, 116, 117, 118, 119, 120, 123, 126])
            self.assertEqual(delete_ids, [
                boi.ObjectId('55edafe77d65cb39ee9882ff'),
                boi.ObjectId('55edcc157d65cb39ee98836e'),
                boi.ObjectId('55edcc1f7d65cb39ee988400'),
                boi.ObjectId('55edcc1f7d65cb39ee988403'),
                boi.ObjectId('55edcc1f7d65cb39ee988406'),
                boi.ObjectId('55edcc1f7d65cb39ee988409'),
                boi.ObjectId('55edcc1f7d65cb39ee98840c'),
                boi.ObjectId('55edcc207d65cb39ee988410'),
                boi.ObjectId('55edcc207d65cb39ee988412'),
                boi.ObjectId('55edcc217d65cb39ee98841f'),
                boi.ObjectId('55edcc217d65cb39ee988429')
            ])
        else:
            # every other section should come through untouched
            self.assertEqual(len(np.nonzero(to_delete_mask.to_numpy())[0]), 0)
            self.assertEqual(len(delete_ids), 0)
def section_to_geojson(section, tl):
    """
    This is the trickiest part of the visualization.
    The section is basically a collection of points with a line through them.
    So the representation is a feature in which one feature which is the line,
    and one feature collection which is the set of point features.
    :param section: the section to be converted
    :param tl: timeline object (currently unused in this version)
    :return: a feature collection which is the geojson version of the section
    """
    ts = esta.TimeSeries.get_time_series(section.user_id)
    # All recreated locations that fall inside this cleaned section's time span
    entry_it = ts.find_entries(["analysis/recreated_location"],
                               esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section",
                                   section.get_id()))
    # TODO: Decide whether we want to use Rewrite to use dataframes throughout instead of python arrays.
    # dataframes insert nans. We could use fillna to fill with default values, but if we are not actually
    # using dataframe features here, it is unclear how much that would help.
    feature_array = []
    section_location_entries = [ecwe.Entry(entry) for entry in entry_it]
    if len(section_location_entries) != 0:
        logging.debug("first element in section_location_array = %s" %
                      section_location_entries[0])
        # Compare coordinates rounded to 4 digits to avoid spurious float
        # mismatches between the last location and the section end
        if not ecc.compare_rounded_arrays(
                section.data.end_loc.coordinates,
                section_location_entries[-1].data.loc.coordinates,
                digits=4):
            logging.info("section_location_array[-1].data.loc %s != section.data.end_loc %s even after df.ts fix, filling gap" % \
                         (section_location_entries[-1].data.loc,
                          section.data.end_loc))
            # NOTE(review): this assert always fires when the branch is taken,
            # making the gap-filling code below unreachable -- looks like a
            # debugging fail-fast; confirm whether it is intentional.
            assert(False)
            last_loc_doc = ts.get_entry_at_ts("background/filtered_location",
                                              "data.ts", section.data.end_ts)
            if last_loc_doc is None:
                logging.warning("can't find entry to patch gap, leaving gap")
            else:
                last_loc_entry = ecwe.Entry(last_loc_doc)
                logging.debug("Adding new entry %s to fill the end point gap between %s and %s" %
                              (last_loc_entry.data.loc,
                               section_location_entries[-1].data.loc,
                               section.data.end_loc))
                section_location_entries.append(last_loc_entry)
    points_line_feature = point_array_to_line(section_location_entries)
    points_line_feature.id = str(section.get_id())
    points_line_feature.properties.update(copy.copy(section.data))
    # Update works on dicts, convert back to a section object to make the modes
    # work properly
    points_line_feature.properties = ecwcs.Cleanedsection(points_line_feature.properties)
    points_line_feature.properties["feature_type"] = "section"
    # Serialize the mode enum as its string form for the geojson payload
    points_line_feature.properties["sensed_mode"] = str(points_line_feature.properties.sensed_mode)
    # start/end locations are not JSON-serializable; drop them from properties
    _del_non_derializable(points_line_feature.properties, ["start_loc", "end_loc"])
    # feature_array.append(gj.FeatureCollection(points_feature_array))
    feature_array.append(points_line_feature)
    return gj.FeatureCollection(feature_array)
def testRemoveAllOutliers(self):
    # Regression test: mark *every* point of every smoothed section as
    # deleted, then verify that clean_and_resample still completes.
    etc.setupRealExample(
        self, "emission/tests/data/real_examples/shankari_2016-06-20")
    self.ts = esta.TimeSeries.get_time_series(self.testUUID)
    eaist.segment_current_trips(self.testUUID)
    eaiss.segment_current_sections(self.testUUID)
    eaicl.filter_current_sections(self.testUUID)
    # get all sections
    sections = [
        ecwe.Entry(s)
        for s in self.ts.find_entries([esda.RAW_SECTION_KEY], time_query=None)
    ]
    for section in sections:
        filtered_points_entry_doc = self.ts.get_entry_at_ts(
            "analysis/smoothing", "data.section", section.get_id())
        if filtered_points_entry_doc is not None:
            logging.debug("Found smoothing result for section %s" %
                          section.get_id())
            # Setting the set of deleted points to everything
            loc_tq = esda.get_time_query_for_trip_like(
                esda.RAW_SECTION_KEY, section.get_id())
            loc_df = self.ts.get_data_df("background/filtered_location",
                                         loc_tq)
            filtered_points_entry_doc["data"]["deleted_points"] = loc_df[
                "_id"].tolist()
            self.ts.update(ecwe.Entry(filtered_points_entry_doc))
    # All we care is that this should not crash.
    eaicr.clean_and_resample(self.testUUID)
    # Most of the trips have zero length, but apparently one has non-zero length
    # because the stop length is non zero!!
    # So there is only one cleaned trip left
    cleaned_trips_df = self.ts.get_data_df(esda.CLEANED_TRIP_KEY,
                                           time_query=None)
    self.assertEqual(len(cleaned_trips_df), 1)
    # We don't support squishing sections, but we only store stops and sections
    # for non-squished trips. And this non-squished trip happens to have
    # two sections and one stop
    cleaned_sections_df = self.ts.get_data_df(esda.CLEANED_SECTION_KEY,
                                              time_query=None)
    self.assertEqual(len(cleaned_sections_df), 2)
    self.assertEqual(cleaned_sections_df.distance.tolist(), [0, 0])
    cleaned_stops_df = self.ts.get_data_df(esda.CLEANED_STOP_KEY,
                                           time_query=None)
    self.assertEqual(len(cleaned_stops_df), 1)
    self.assertAlmostEqual(cleaned_stops_df.distance[0], 3252, places=0)
def get_filtered_points(section, filtered_section_data):
    """Return the section's locations with smoothed-away outliers removed,
    resampled at 30 second intervals and annotated with dist/heading/speed.

    :param section: the raw section entry whose points should be loaded
    :param filtered_section_data: currently unused here -- TODO confirm
    :return: dataframe of resampled points with an "idx" column and all
        rows containing n/a dropped
    """
    logging.debug("Getting filtered points for section %s" % section)
    ts = esta.TimeSeries.get_time_series(section.user_id)
    loc_entry_it = ts.find_entries(["background/filtered_location"],
                                   esda.get_time_query_for_trip_like(
                                       esda.RAW_SECTION_KEY, section.get_id()))
    loc_entry_list = [ecwe.Entry(e) for e in loc_entry_it]
    # We know that the assertion fails in the geojson conversion code and we
    # handle it there, so we are just going to comment this out for now.
    # assert (loc_entry_list[-1].data.loc == section.data.end_loc,
    #         "section_location_array[-1].loc != section.end_loc even after df.ts fix",
    #         (loc_entry_list[-1].data.loc, section.data.end_loc))
    # Find the list of points to filter
    filtered_points_entry_doc = ts.get_entry_at_ts("analysis/smoothing",
                                                   "data.section",
                                                   section.get_id())
    if filtered_points_entry_doc is None:
        logging.debug(
            "No filtered_points_entry, filtered_points_list is empty")
        filtered_point_id_list = []
    else:
        # TODO: Figure out how to make collections work for the wrappers and then change this to an Entry
        filtered_points_entry = ad.AttrDict(filtered_points_entry_doc)
        filtered_point_id_list = list(
            filtered_points_entry.data.deleted_points)
        logging.debug("deleting %s points from section points" %
                      len(filtered_point_id_list))
    filtered_loc_list = remove_outliers(loc_entry_list, filtered_point_id_list)
    # filtered_loc_list has removed the outliers. Now, we resample the data at
    # 30 sec intervals
    resampled_loc_df = resample(filtered_loc_list, interval=30)
    # If this is the first section, we need to find the start place of the parent trip
    # and actually start from there. That will fix the distances but not the duration
    # because we haven't yet figured out how to get the correct start time.
    # TODO: Fix this!!
    # For now, we will fudge this in the geojson converter, as always
    with_speeds_df = eaicl.add_dist_heading_speed(resampled_loc_df)
    with_speeds_df["idx"] = np.arange(0, len(with_speeds_df))
    with_speeds_df_nona = with_speeds_df.dropna()
    # BUG FIX: the operands were reversed, which always logged a count <= 0;
    # dropna can only shrink the frame, so removed = original - remaining.
    logging.info("removed %d entries containing n/a" %
                 (len(with_speeds_df) - len(with_speeds_df_nona)))
    return with_speeds_df_nona
def filter_jumps(user_id, section_id):
    """
    filters out any jumps in the points related to this section and stores
    a entry that lists the deleted points for this trip and this section.
    :param user_id: the user id to filter the trips for
    :param section_id: the section_id to filter the trips for
    :return: none. saves an entry with the filtered points into the database.
    """
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))
    outlier_algo = eaico.BoxplotOutlier()
    tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY, section_id)
    ts = esta.TimeSeries.get_time_series(user_id)
    section_points_df = ts.get_data_df("background/filtered_location", tq)
    # If every point carries the "distance" filter, treat the section as iOS
    # (distance-filtered) and pad its temporal gaps with fake points
    is_ios = section_points_df["filter"].dropna().unique().tolist() == [
        "distance"
    ]
    if is_ios:
        logging.debug("Found iOS section, filling in gaps with fake data")
        section_points_df = _ios_fill_fake_data(section_points_df)
    filtering_algo = eaicj.SmoothZigzag(is_ios, DEFAULT_SAME_POINT_DISTANCE)
    logging.debug("len(section_points_df) = %s" % len(section_points_df))
    points_to_ignore_df = get_points_to_filter(section_points_df, outlier_algo,
                                               filtering_algo)
    if points_to_ignore_df is None:
        # There were no points to delete
        return
    # presumably the fake gap-filling points have no _id, so dropna discards
    # them here -- confirm against _ios_fill_fake_data
    points_to_ignore_df_filtered = points_to_ignore_df._id.dropna()
    logging.debug(
        "after filtering ignored points, %s -> %s" %
        (len(points_to_ignore_df), len(points_to_ignore_df_filtered)))
    # We shouldn't really filter any fuzzed points because they represent 100m in 60 secs
    # but let's actually check for that
    # assert len(points_to_ignore_df) == len(points_to_ignore_df_filtered)
    deleted_point_id_list = list(points_to_ignore_df_filtered)
    logging.debug("deleted %s points" % len(deleted_point_id_list))
    # Persist only the ids of the deleted points; raw points stay intact
    filter_result = ecws.Smoothresults()
    filter_result.section = section_id
    filter_result.deleted_points = deleted_point_id_list
    filter_result.outlier_algo = "BoxplotOutlier"
    filter_result.filtering_algo = "SmoothZigzag"
    result_entry = ecwe.Entry.create_entry(user_id, "analysis/smoothing",
                                           filter_result)
    ts.insert(result_entry)
def testPointFilteringZigzag(self):
    """Trip 8 contains a zigzag section: smoothing should drop exactly the
    eleven known outlier points in its first section and nothing elsewhere."""
    zigzag_trip = self.trip_entries[8]
    self.loadPointsForTrip(zigzag_trip.get_id())
    zigzag_sections = [entry for entry in self.section_entries
                       if entry.data.trip_id == zigzag_trip.get_id()]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag(False, 100)
    expected_deleted_ids = [boi.ObjectId(hex_id) for hex_id in [
        '55edafe77d65cb39ee9882ff', '55edcc157d65cb39ee98836e',
        '55edcc1f7d65cb39ee988400', '55edcc1f7d65cb39ee988403',
        '55edcc1f7d65cb39ee988406', '55edcc1f7d65cb39ee988409',
        '55edcc1f7d65cb39ee98840c', '55edcc207d65cb39ee988410',
        '55edcc207d65cb39ee988412', '55edcc217d65cb39ee98841f',
        '55edcc217d65cb39ee988429']]
    for idx, curr_section in enumerate(zigzag_sections):
        logging.debug("-" * 20 + "Considering section %s" % idx + "-" * 20)
        loc_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   curr_section.get_id())
        section_df = self.ts.get_data_df("background/filtered_location", loc_tq)
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)
        max_speed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (idx, max_speed))
        jump_algo.filter(with_speeds_df)
        logging.debug("Retaining points %s" % np.nonzero(jump_algo.inlier_mask_))
        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" % np.nonzero(to_delete_mask))
        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)
        if idx == 0:
            # this is the zigzag section
            self.assertEqual(np.nonzero(to_delete_mask)[0].tolist(),
                             [25, 64, 114, 115, 116, 117, 118, 119, 120,
                              123, 126])
            self.assertEqual(delete_ids, expected_deleted_ids)
        else:
            self.assertEqual(len(np.nonzero(to_delete_mask)[0]), 0)
            self.assertEqual(len(delete_ids), 0)
def get_filtered_points(section, filtered_section_data):
    """Return the section's locations with smoothed-away outliers removed,
    resampled at 30 second intervals and annotated with dist/heading/speed.

    :param section: the raw section entry whose points should be loaded
    :param filtered_section_data: currently unused here -- TODO confirm
    :return: dataframe of resampled points with an "idx" column and all
        rows containing n/a dropped
    """
    logging.debug("Getting filtered points for section %s" % section)
    ts = esta.TimeSeries.get_time_series(section.user_id)
    loc_entry_it = ts.find_entries(["background/filtered_location"],
                                   esda.get_time_query_for_trip_like(
                                       esda.RAW_SECTION_KEY, section.get_id()))
    loc_entry_list = [ecwe.Entry(e) for e in loc_entry_it]
    # We know that the assertion fails in the geojson conversion code and we
    # handle it there, so we are just going to comment this out for now.
    # assert (loc_entry_list[-1].data.loc == section.data.end_loc,
    #         "section_location_array[-1].loc != section.end_loc even after df.ts fix",
    #         (loc_entry_list[-1].data.loc, section.data.end_loc))
    # Find the list of points to filter
    filtered_points_entry_doc = ts.get_entry_at_ts("analysis/smoothing",
                                                   "data.section",
                                                   section.get_id())
    if filtered_points_entry_doc is None:
        logging.debug("No filtered_points_entry, filtered_points_list is empty")
        filtered_point_id_list = []
    else:
        # TODO: Figure out how to make collections work for the wrappers and then change this to an Entry
        filtered_points_entry = ad.AttrDict(filtered_points_entry_doc)
        filtered_point_id_list = list(filtered_points_entry.data.deleted_points)
        logging.debug("deleting %s points from section points" % len(
            filtered_point_id_list))
    filtered_loc_list = remove_outliers(loc_entry_list, filtered_point_id_list)
    # filtered_loc_list has removed the outliers. Now, we resample the data at
    # 30 sec intervals
    resampled_loc_df = resample(filtered_loc_list, interval=30)
    # If this is the first section, we need to find the start place of the parent trip
    # and actually start from there. That will fix the distances but not the duration
    # because we haven't yet figured out how to get the correct start time.
    # TODO: Fix this!!
    # For now, we will fudge this in the geojson converter, as always
    with_speeds_df = eaicl.add_dist_heading_speed(resampled_loc_df)
    with_speeds_df["idx"] = np.arange(0, len(with_speeds_df))
    with_speeds_df_nona = with_speeds_df.dropna()
    # BUG FIX: the operands were reversed, which always logged a count <= 0;
    # dropna can only shrink the frame, so removed = original - remaining.
    logging.info("removed %d entries containing n/a" %
                 (len(with_speeds_df) - len(with_speeds_df_nona)))
    return with_speeds_df_nona
def testPointFilteringShanghaiJump(self):
    # trip_entries[0] is the Shanghai jump trip fixture (per the test name)
    classicJumpTrip1 = self.trip_entries[0]
    self.loadPointsForTrip(classicJumpTrip1.get_id())
    classicJumpSections1 = [
        s for s in self.section_entries
        if s.data.trip_id == classicJumpTrip1.get_id()
    ]
    outlier_algo = eaics.BoxplotOutlier()
    # SmoothZigzag(False, 100) -- presumably (is_ios, same_point_distance);
    # TODO confirm against the SmoothZigzag constructor
    jump_algo = eaicj.SmoothZigzag(False, 100)
    for i, section_entry in enumerate(classicJumpSections1):
        logging.debug("-" * 20 + "Considering section %s" % i + "-" * 20)
        section_df = self.ts.get_data_df(
            "background/filtered_location",
            esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                              section_entry.get_id()))
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)
        maxSpeed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (i, maxSpeed))
        jump_algo.filter(with_speeds_df)
        # convert the pandas mask to a numpy array before np.nonzero
        logging.debug("Retaining points %s" %
                      np.nonzero(jump_algo.inlier_mask_.to_numpy()))
        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" %
                      np.nonzero(to_delete_mask.to_numpy()))
        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)
        # Automated checks. Might be able to remove logging statements later
        if i != 2:
            # Not the bad section. Should not be filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 0)
            self.assertEqual(len(delete_ids), 0)
        else:
            # The bad section, should have the third point filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 1)
            self.assertEqual([str(id) for id in delete_ids],
                             ["55d8c4837d65cb39ee983cb4"])
def filter_jumps(user_id, section_id):
    """Detect and record outlier ("jump") points for one section.

    Runs boxplot outlier detection plus zigzag smoothing over the section's
    filtered locations (padding iOS distance-filtered sections with fake
    points first) and saves an "analysis/smoothing" entry listing the ids of
    the points that should be ignored.

    :param user_id: the user id to filter the trips for
    :param section_id: the section_id to filter the trips for
    :return: None; the result entry is written to the database.
    """
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))
    outlier_algo = eaico.BoxplotOutlier()
    ts = esta.TimeSeries.get_time_series(user_id)
    section_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   section_id)
    section_points_df = ts.get_data_df("background/filtered_location",
                                       section_tq)
    is_ios = (section_points_df["filter"].dropna().unique().tolist()
              == ["distance"])
    if is_ios:
        logging.debug("Found iOS section, filling in gaps with fake data")
        section_points_df = _ios_fill_fake_data(section_points_df)
    filtering_algo = eaicj.SmoothZigzag(is_ios, DEFAULT_SAME_POINT_DISTANCE)
    logging.debug("len(section_points_df) = %s" % len(section_points_df))
    ignored_points_df = get_points_to_filter(section_points_df, outlier_algo,
                                             filtering_algo)
    if ignored_points_df is None:
        # Nothing to smooth away for this section
        return
    ignored_ids_series = ignored_points_df._id.dropna()
    logging.debug("after filtering ignored points, %s -> %s" %
                  (len(ignored_points_df), len(ignored_ids_series)))
    # We shouldn't really filter any fuzzed points because they represent 100m in 60 secs
    # but let's actually check for that
    # assert len(points_to_ignore_df) == len(points_to_ignore_df_filtered)
    deleted_point_id_list = list(ignored_ids_series)
    logging.debug("deleted %s points" % len(deleted_point_id_list))
    filter_result = ecws.Smoothresults()
    filter_result.section = section_id
    filter_result.deleted_points = deleted_point_id_list
    filter_result.outlier_algo = "BoxplotOutlier"
    filter_result.filtering_algo = "SmoothZigzag"
    result_entry = ecwe.Entry.create_entry(user_id, "analysis/smoothing",
                                           filter_result)
    ts.insert(result_entry)
def testRemoveAllOutliers(self):
    """Mark every point of every smoothed section as deleted and verify that
    the clean/resample pipeline still completes, leaving the expected single
    cleaned trip with two sections and one stop."""
    etc.setupRealExample(self,
                         "emission/tests/data/real_examples/shankari_2016-06-20")
    self.ts = esta.TimeSeries.get_time_series(self.testUUID)
    eaist.segment_current_trips(self.testUUID)
    eaiss.segment_current_sections(self.testUUID)
    eaicl.filter_current_sections(self.testUUID)
    # get all sections
    raw_sections = [ecwe.Entry(doc) for doc in
                    self.ts.find_entries([esda.RAW_SECTION_KEY],
                                         time_query=None)]
    for curr_section in raw_sections:
        smoothing_doc = self.ts.get_entry_at_ts("analysis/smoothing",
                                                "data.section",
                                                curr_section.get_id())
        if smoothing_doc is not None:
            logging.debug("Found smoothing result for section %s" %
                          curr_section.get_id())
            # Setting the set of deleted points to everything
            loc_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                       curr_section.get_id())
            loc_df = self.ts.get_data_df("background/filtered_location",
                                         loc_tq)
            smoothing_doc["data"]["deleted_points"] = loc_df["_id"].tolist()
            self.ts.update(ecwe.Entry(smoothing_doc))
    # All we care is that this should not crash.
    eaicr.clean_and_resample(self.testUUID)
    # Most of the trips have zero length, but apparently one has non-zero length
    # because the stop length is non zero!!
    # So there is only one cleaned trip left
    cleaned_trips_df = self.ts.get_data_df(esda.CLEANED_TRIP_KEY,
                                           time_query=None)
    self.assertEqual(len(cleaned_trips_df), 1)
    # We don't support squishing sections, but we only store stops and sections
    # for non-squished trips. And this non-squished trip happens to have
    # two sections and one stop
    cleaned_sections_df = self.ts.get_data_df(esda.CLEANED_SECTION_KEY,
                                              time_query=None)
    self.assertEqual(len(cleaned_sections_df), 2)
    self.assertEqual(cleaned_sections_df.distance.tolist(), [0, 0])
    cleaned_stops_df = self.ts.get_data_df(esda.CLEANED_STOP_KEY,
                                           time_query=None)
    self.assertEqual(len(cleaned_stops_df), 1)
    self.assertAlmostEqual(cleaned_stops_df.distance[0], 3252, places=0)
def testPointFilteringShanghaiJump(self):
    """Only section 2 of the Shanghai jump trip should lose a single point
    (the known outlier); every other section must pass through unfiltered."""
    jump_trip = self.trip_entries[0]
    self.loadPointsForTrip(jump_trip.get_id())
    jump_sections = [entry for entry in self.section_entries
                     if entry.data.trip_id == jump_trip.get_id()]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag()
    for idx, curr_section in enumerate(jump_sections):
        logging.debug("-" * 20 + "Considering section %s" % idx + "-" * 20)
        loc_tq = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   curr_section.get_id())
        section_df = self.ts.get_data_df("background/filtered_location", loc_tq)
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)
        max_speed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (idx, max_speed))
        jump_algo.filter(with_speeds_df)
        logging.debug("Retaining points %s" % np.nonzero(jump_algo.inlier_mask_))
        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" % np.nonzero(to_delete_mask))
        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)
        if idx != 2:
            # Not the bad section. Should not be filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 0)
            self.assertEqual(len(delete_ids), 0)
        else:
            # The bad section, should have the third point filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 1)
            self.assertEqual([str(oid) for oid in delete_ids],
                             ["55d8c4837d65cb39ee983cb4"])
def segment_trip_into_sections(user_id, trip_entry, trip_source):
    """Break one raw trip into motion-homogeneous sections separated by stops
    and persist the resulting section/stop entries.

    :param user_id: the user whose timeseries is read and written
    :param trip_entry: the raw trip entry to segment
    :param trip_source: which trip segmentation method produced this trip;
        selects the matching section segmentation method below
    """
    ts = esta.TimeSeries.get_time_series(user_id)
    time_query = esda.get_time_query_for_trip_like(esda.RAW_TRIP_KEY,
                                                   trip_entry.get_id())
    distance_from_place = _get_distance_from_start_place_to_end(trip_entry)
    if (trip_source == "DwellSegmentationTimeFilter"):
        import emission.analysis.intake.segmentation.section_segmentation_methods.smoothed_high_confidence_motion as shcm
        shcmsm = shcm.SmoothedHighConfidenceMotion(60, 100, [
            ecwm.MotionTypes.TILTING, ecwm.MotionTypes.UNKNOWN,
            ecwm.MotionTypes.STILL
        ])
    else:
        assert (trip_source == "DwellSegmentationDistFilter")
        import emission.analysis.intake.segmentation.section_segmentation_methods.smoothed_high_confidence_with_visit_transitions as shcmvt
        shcmsm = shcmvt.SmoothedHighConfidenceMotionWithVisitTransitions(
            49, 50, [
                ecwm.MotionTypes.TILTING,
                ecwm.MotionTypes.UNKNOWN,
                ecwm.MotionTypes.STILL,
                ecwm.MotionTypes.NONE,  # iOS only
                ecwm.MotionTypes.STOPPED_WHILE_IN_VEHICLE
            ])  # iOS only
    segmentation_points = shcmsm.segment_into_sections(ts, distance_from_place,
                                                       time_query)
    # Since we are segmenting an existing trip into sections, we do not need to worry about linking with
    # a prior place, since it will be linked through the trip object.
    # So this is much simpler than the trip case.
    # Again, since this is segmenting a trip, we can just start with a section
    prev_section_entry = None
    # TODO: Should we link the locations to the trips this way, or by using a foreign key?
    # If we want to use a foreign key, then we need to include the object id in the data df as well so that we can
    # set it properly.
    # NOTE(review): ts was already created above; this re-creation looks
    # redundant -- confirm it can be removed.
    ts = esta.TimeSeries.get_time_series(user_id)
    # Look up the location whose data.ts matches the given timestamp
    get_loc_for_ts = lambda time: ecwl.Location(
        ts.get_entry_at_ts("background/filtered_location", "data.ts", time)[
            "data"])
    trip_start_loc = get_loc_for_ts(trip_entry.data.start_ts)
    trip_end_loc = get_loc_for_ts(trip_entry.data.end_ts)
    logging.debug("trip_start_loc = %s, trip_end_loc = %s" %
                  (trip_start_loc, trip_end_loc))
    for (i, (start_loc_doc, end_loc_doc, sensed_mode)) in enumerate(
            segmentation_points):
        logging.debug("start_loc_doc = %s, end_loc_doc = %s" %
                      (start_loc_doc, end_loc_doc))
        get_loc_for_row = lambda row: ts.df_row_to_entry(
            "background/filtered_location", row).data
        start_loc = get_loc_for_row(start_loc_doc)
        end_loc = get_loc_for_row(end_loc_doc)
        logging.debug("start_loc = %s, end_loc = %s" % (start_loc, end_loc))
        section = ecwc.Section()
        section.trip_id = trip_entry.get_id()
        if prev_section_entry is None:
            # This is the first point, so we want to start from the start of the trip, not the start of this segment
            start_loc = trip_start_loc
        if i == len(segmentation_points) - 1:
            # This is the last point, so we want to end at the end of the trip, not at the end of this segment
            # Particularly in this case, if we don't do this, then the trip end may overshoot the section end
            end_loc = trip_end_loc
        fill_section(section, start_loc, end_loc, sensed_mode)
        # We create the entry after filling in the section so that we know
        # that the data is included properly
        section_entry = ecwe.Entry.create_entry(user_id, esda.RAW_SECTION_KEY,
                                                section, create_id=True)
        if prev_section_entry is not None:
            # If this is not the first section, create a stop to link the two sections together
            # The expectation is prev_section -> stop -> curr_section
            stop = ecws.Stop()
            stop.trip_id = trip_entry.get_id()
            stop_entry = ecwe.Entry.create_entry(user_id, esda.RAW_STOP_KEY,
                                                 stop, create_id=True)
            logging.debug("stop = %s, stop_entry = %s" % (stop, stop_entry))
            # presumably stitch_together cross-links prev section, stop and
            # curr section, which is why the already-inserted prev section
            # needs an update afterwards
            stitch_together(prev_section_entry, stop_entry, section_entry)
            ts.insert(stop_entry)
            ts.update(prev_section_entry)
        # After we go through the loop, we will be left with the last section,
        # which does not have an ending stop. We insert that too.
        ts.insert(section_entry)
        prev_section_entry = section_entry
def segment_trip_into_sections(user_id, trip_id, trip_source):
    """
    Break one raw trip into motion-homogeneous sections, linked by stops.

    Picks a section-segmentation strategy based on how the trip itself was
    segmented (`trip_source`), runs it over the trip's location/motion data,
    and writes alternating RAW_SECTION / RAW_STOP entries to the timeseries,
    stitched together via `stitch_together`.

    :param user_id: id of the user whose timeseries we read from and write to
    :param trip_id: id of the raw trip (esda.RAW_TRIP_KEY) to segment
    :param trip_source: name of the trip-segmentation filter that produced the
        trip; must be "DwellSegmentationTimeFilter" or
        "DwellSegmentationDistFilter"
    :return: None; results are persisted via ts.insert/ts.update side effects
    """
    ts = esta.TimeSeries.get_time_series(user_id)
    trip = esda.get_object(esda.RAW_TRIP_KEY, trip_id)
    # Restrict all queries below to the [trip.start_ts, trip.end_ts] window
    time_query = esda.get_time_query_for_trip_like(esda.RAW_TRIP_KEY, trip_id)

    if (trip_source == "DwellSegmentationTimeFilter"):
        import emission.analysis.intake.segmentation.section_segmentation_methods.smoothed_high_confidence_motion as shcm
        # 60s smoothing window; TILTING/UNKNOWN/STILL readings are ignored
        # rather than treated as mode changes
        shcmsm = shcm.SmoothedHighConfidenceMotion(60, [ecwm.MotionTypes.TILTING,
                                                        ecwm.MotionTypes.UNKNOWN,
                                                        ecwm.MotionTypes.STILL])
    else:
        assert(trip_source == "DwellSegmentationDistFilter")
        import emission.analysis.intake.segmentation.section_segmentation_methods.smoothed_high_confidence_with_visit_transitions as shcmvt
        shcmsm = shcmvt.SmoothedHighConfidenceMotionWithVisitTransitions(
            49, [ecwm.MotionTypes.TILTING,
                 ecwm.MotionTypes.UNKNOWN,
                 ecwm.MotionTypes.STILL,
                 ecwm.MotionTypes.NONE,  # iOS only
                 ecwm.MotionTypes.STOPPED_WHILE_IN_VEHICLE])  # iOS only

    # Each segmentation point is a (start_loc_doc, end_loc_doc, sensed_mode)
    # triple describing one candidate section
    segmentation_points = shcmsm.segment_into_sections(ts, time_query)

    # Since we are segmenting an existing trip into sections, we do not need to worry about linking with
    # a prior place, since it will be linked through the trip object.
    # So this is much simpler than the trip case.
    # Again, since this is segmenting a trip, we can just start with a section
    prev_section_entry = None

    # TODO: Should we link the locations to the trips this way, or by using a foreign key?
    # If we want to use a foreign key, then we need to include the object id in the data df as well so that we can
    # set it properly.
    # NOTE(review): this re-fetch of the timeseries looks redundant — ts was
    # already obtained at the top of the function; harmless but could be removed
    ts = esta.TimeSeries.get_time_series(user_id)
    get_loc_for_ts = lambda time: ecwl.Location(ts.get_entry_at_ts("background/filtered_location", "data.ts", time)["data"])
    trip_start_loc = get_loc_for_ts(trip.start_ts)
    trip_end_loc = get_loc_for_ts(trip.end_ts)
    logging.debug("trip_start_loc = %s, trip_end_loc = %s" % (trip_start_loc, trip_end_loc))

    for (i, (start_loc_doc, end_loc_doc, sensed_mode)) in enumerate(segmentation_points):
        logging.debug("start_loc_doc = %s, end_loc_doc = %s" % (start_loc_doc, end_loc_doc))
        # Segmentation points arrive as dataframe rows; convert them back to
        # location data objects
        get_loc_for_row = lambda row: ts.df_row_to_entry("background/filtered_location", row).data
        start_loc = get_loc_for_row(start_loc_doc)
        end_loc = get_loc_for_row(end_loc_doc)
        logging.debug("start_loc = %s, end_loc = %s" % (start_loc, end_loc))

        section = ecwc.Section()
        section.trip_id = trip_id
        if prev_section_entry is None:
            # This is the first point, so we want to start from the start of the trip, not the start of this segment
            start_loc = trip_start_loc
        if i == len(segmentation_points) - 1:
            # This is the last point, so we want to end at the end of the trip, not at the end of this segment
            # Particularly in this case, if we don't do this, then the trip end may overshoot the section end
            end_loc = trip_end_loc

        fill_section(section, start_loc, end_loc, sensed_mode)
        # We create the entry after filling in the section so that we know
        # that the data is included properly
        section_entry = ecwe.Entry.create_entry(user_id, esda.RAW_SECTION_KEY,
                                                section, create_id=True)

        if prev_section_entry is not None:
            # If this is not the first section, create a stop to link the two sections together
            # The expectation is prev_section -> stop -> curr_section
            stop = ecws.Stop()
            stop.trip_id = trip_id
            stop_entry = ecwe.Entry.create_entry(user_id,
                                                 esda.RAW_STOP_KEY,
                                                 stop, create_id=True)
            logging.debug("stop = %s, stop_entry = %s" % (stop, stop_entry))
            # Fill in cross-references between the two sections and the stop
            # before persisting; prev_section was already inserted, so it gets
            # an update rather than a second insert
            stitch_together(prev_section_entry, stop_entry, section_entry)
            ts.insert(stop_entry)
            ts.update(prev_section_entry)

        # After we go through the loop, we will be left with the last section,
        # which does not have an ending stop. We insert that too.
        ts.insert(section_entry)
        prev_section_entry = section_entry
def section_to_geojson(section, tl):
    """
    This is the trickiest part of the visualization.
    The section is basically a collection of points with a line through them.
    So the representation is a feature in which one feature which is the line,
    and one feature collection which is the set of point features.

    :param section: the section to be converted
    :param tl: the timeline, used to look up the parent trip's start place when
        this is the first section of the trip
    :return: a feature collection which is the geojson version of the section
    """
    ts = esta.TimeSeries.get_time_series(section.user_id)
    entry_it = ts.find_entries(["analysis/recreated_location"],
                               esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section",
                                   section.get_id()))

    # TODO: Decide whether we want to use Rewrite to use dataframes throughout instead of python arrays.
    # dataframes insert nans. We could use fillna to fill with default values, but if we are not actually
    # using dataframe features here, it is unclear how much that would help.
    feature_array = []
    section_location_entries = [ecwe.Entry(entry) for entry in entry_it]
    if len(section_location_entries) != 0:
        logging.debug("first element in section_location_array = %s" %
                      section_location_entries[0])

        # FIX: this used to be a hard `assert`, which crashed the pipeline
        # whenever the last recreated location did not line up with the
        # section's end location (the known ts != write_ts mismatch), and
        # would be silently stripped under `python -O`. Instead, fudge the
        # end point so that we don't have a gap.
        # TODO: Fix this once we are able to query by the data timestamp instead of the metadata ts
        if section_location_entries[-1].data.loc != section.data.end_loc:
            logging.info("section_location_array[-1].data.loc %s != section.data.end_loc %s even after df.ts fix, filling gap" % \
                         (section_location_entries[-1].data.loc, section.data.end_loc))
            last_loc_doc = ts.get_entry_at_ts("background/filtered_location",
                                              "data.ts", section.data.end_ts)
            if last_loc_doc is None:
                # Best effort only: if no raw point exists at end_ts, keep the gap
                logging.warning("can't find entry to patch gap, leaving gap")
            else:
                last_loc_entry = ecwe.Entry(last_loc_doc)
                logging.debug("Adding new entry %s to fill the end point gap between %s and %s"
                              % (last_loc_entry.data.loc,
                                 section_location_entries[-1].data.loc,
                                 section.data.end_loc))
                section_location_entries.append(last_loc_entry)

    points_line_feature = point_array_to_line(section_location_entries)

    # If this is the first section, we already start from the trip start. But we actually need to start from the
    # prior place. Fudge this too. Note also that we may want to figure out how to handle this properly in the model
    # without needing fudging. TODO: Unclear how exactly to do this
    if section.data.start_stop is None:
        # This is the first section. So we need to find the start place of the parent trip
        parent_trip = tl.get_object(section.data.trip_id)
        start_place_of_parent_trip = tl.get_object(parent_trip.data.start_place)
        points_line_feature.geometry.coordinates.insert(
            0, start_place_of_parent_trip.data.location.coordinates)

    points_line_feature.id = str(section.get_id())
    points_line_feature.properties = copy.copy(section.data)
    points_line_feature.properties["feature_type"] = "section"
    points_line_feature.properties["sensed_mode"] = str(
        points_line_feature.properties.sensed_mode)

    # start_loc/end_loc are not JSON-serializable, and are already represented
    # by the line geometry
    _del_non_derializable(points_line_feature.properties,
                          ["start_loc", "end_loc"])

    feature_array.append(points_line_feature)

    return gj.FeatureCollection(feature_array)
def testGetTimeRangeForTrip(self):
    """The time query for a raw trip spans [start_ts, end_ts] on data.ts."""
    fake_trip = self.create_fake_trip()
    time_query = esda.get_time_query_for_trip_like(
        esda.RAW_TRIP_KEY, fake_trip.get_id())
    # Fake trip fixture runs from ts=5 to ts=6 on the data timestamp
    self.assertEqual(time_query.timeType, "data.ts")
    self.assertEqual(time_query.startTs, 5)
    self.assertEqual(time_query.endTs, 6)