def filter_accuracy(user_id):
    """Drop inaccurate "background/location" points for a user and insert the
    accurate survivors (accuracy < 200m) as "background/filtered_location".

    On success, marks the accuracy-filtering pipeline stage done with the
    write_ts of the last unfiltered point processed (or None if there was
    nothing to process). On any error, marks the stage failed.

    :param user_id: UUID of the user whose points should be filtered
    """
    time_query = epq.get_time_range_for_accuracy_filtering(user_id)
    timeseries = esta.TimeSeries.get_time_series(user_id)
    try:
        unfiltered_points_df = timeseries.get_data_df("background/location", time_query)
        filtered_from_unfiltered_df = unfiltered_points_df[unfiltered_points_df.accuracy < 200]
        logging.info("filtered %d of %d points" %
                     (len(filtered_from_unfiltered_df), len(unfiltered_points_df)))
        if len(unfiltered_points_df) == 0:
            # Nothing in this range; mark done with no "last processed" marker
            epq.mark_accuracy_filtering_done(user_id, None)
        else:
            for idx, entry in filtered_from_unfiltered_df.iterrows():
                # First, we check to see if this is a duplicate of an existing entry.
                # If so, we will skip it since it is probably generated as a duplicate...
                if check_prior_duplicate(filtered_from_unfiltered_df, idx, entry):
                    logging.info("Found duplicate entry at index %s, id = %s, lat = %s, lng = %s, skipping" %
                                 (idx, entry._id, entry.latitude, entry.longitude))
                    continue
                # Next, we check to see if there is an existing "background/filtered_location" point that corresponds
                # to this point. If there is, then we don't want to re-insert. This ensures that this step is idempotent
                if check_existing_filtered_location(timeseries, entry):
                    logging.info("Found existing filtered location for entry at index = %s, id = %s, ts = %s, fmt_time = %s, skipping" %
                                 (idx, entry._id, entry.ts, entry.fmt_time))
                    continue
                # logging.debug("Inserting %s filtered entry %s into timeseries" % (idx, entry))
                entry_copy = convert_to_filtered(timeseries.get_entry_at_ts(
                    "background/location", "metadata.write_ts", entry.metadata_write_ts))
                timeseries.insert(entry_copy)
            last_entry_processed = unfiltered_points_df.iloc[-1].metadata_write_ts
            epq.mark_accuracy_filtering_done(user_id, last_entry_processed)
    except Exception:
        # Was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt; Exception is broad enough for a pipeline boundary
        logging.exception("Marking accuracy filtering as failed")
        epq.mark_accuracy_filtering_failed(user_id)
def testEmptyCallToPriorDuplicate(self):
    """An empty dataframe can never contain a prior duplicate."""
    query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    points_df = self.ts.get_data_df("background/location", query)
    # Sanity check: the fixture should have loaded 205 unfiltered points
    self.assertEqual(205, len(points_df))
    # Check call to check duplicate with a zero length dataframe
    probe_entry = points_df.iloc[5]
    self.assertEqual(False,
                     eaicf.check_prior_duplicate(pd.DataFrame(), 0, probe_entry))
def testEmptyCallToPriorDuplicate(self):
    """check_prior_duplicate against a zero-length dataframe reports False."""
    time_range = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    loaded_df = self.ts.get_data_df("background/location", time_range)
    # The test fixture is expected to contain exactly 205 points
    self.assertEqual(len(loaded_df), 205)
    sample = loaded_df.iloc[5]
    # Check call to check duplicate with a zero length dataframe
    empty_df = pd.DataFrame()
    self.assertEqual(eaicf.check_prior_duplicate(empty_df, 0, sample), False)
def testConvertToFiltered(self):
    """convert_to_filtered must strip the _id (so the copy can be
    re-inserted) and rewrite the key to background/filtered_location."""
    time_query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    unfiltered_points_df = self.ts.get_data_df("background/location", time_query)
    self.assertEqual(len(unfiltered_points_df), 205)
    entry_from_df = unfiltered_points_df.iloc[5]
    entry_copy = eaicf.convert_to_filtered(self.ts.get_entry_at_ts(
        "background/location", "metadata.write_ts",
        entry_from_df.metadata_write_ts))
    self.assertNotIn("_id", entry_copy)
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual
    self.assertEqual(entry_copy["metadata"]["key"], "background/filtered_location")
def testCheckPriorDuplicate(self):
    """A point is a prior duplicate only when an identical point occurs
    earlier in the dataframe."""
    time_query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    unfiltered_points_df = self.ts.get_data_df("background/location", time_query)
    self.assertEqual(len(unfiltered_points_df), 205)
    entry = unfiltered_points_df.iloc[5]
    # Prepend 5 copies of the entry. DataFrame.append was removed in
    # pandas 2.0, so build the combined frame with pd.concat instead.
    unfiltered_appended_df = pd.concat(
        [pd.DataFrame([entry] * 5), unfiltered_points_df]).reset_index()
    logging.debug("unfiltered_appended_df = %s" %
                  unfiltered_appended_df[["fmt_time"]].head())
    # Copies occupy indices 0..4, so index 5 has prior duplicates...
    self.assertEqual(eaicf.check_prior_duplicate(unfiltered_appended_df, 5, entry), True)
    # ...but in the original frame, index 5 is the first occurrence
    self.assertEqual(eaicf.check_prior_duplicate(unfiltered_points_df, 5, entry), False)
def testExistingFilteredLocation(self):
    """check_existing_filtered_location flips from False to True once the
    corresponding filtered entry has been inserted."""
    query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    points_df = self.ts.get_data_df("background/location", query)
    self.assertEqual(len(points_df), 205)
    probe = points_df.iloc[5]
    # No filtered counterpart exists yet
    self.assertEqual(eaicf.check_existing_filtered_location(self.ts, probe), False)
    # Insert the filtered twin of the probe point...
    raw_entry = self.ts.get_entry_at_ts("background/location",
                                        "metadata.write_ts",
                                        probe.metadata_write_ts)
    self.ts.insert(eaicf.convert_to_filtered(raw_entry))
    # ...and now the check should find it
    self.assertEqual(eaicf.check_existing_filtered_location(self.ts, probe), True)
def testConvertToFiltered(self):
    """The filtered copy of an entry must drop its _id and carry the
    background/filtered_location key."""
    time_query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    unfiltered_points_df = self.ts.get_data_df("background/location", time_query)
    self.assertEqual(len(unfiltered_points_df), 205)
    entry_from_df = unfiltered_points_df.iloc[5]
    entry_copy = eaicf.convert_to_filtered(
        self.ts.get_entry_at_ts("background/location", "metadata.write_ts",
                                entry_from_df.metadata_write_ts))
    self.assertNotIn("_id", entry_copy)
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual
    self.assertEqual(entry_copy["metadata"]["key"], "background/filtered_location")
def filter_accuracy(user_id):
    """Drop inaccurate "background/location" points for a user and insert the
    accurate survivors (accuracy < 200m) as "background/filtered_location".

    Phones that do not collect continuously in this range are assumed to
    already have filtered data, so the stage is marked done immediately.
    On success, marks the accuracy-filtering stage done with the write_ts of
    the last unfiltered point processed (or None if there was nothing to
    process). On any error, marks the stage failed.

    :param user_id: UUID of the user whose points should be filtered
    """
    time_query = epq.get_time_range_for_accuracy_filtering(user_id)
    timeseries = esta.TimeSeries.get_time_series(user_id)
    if not continuous_collection_in_range(timeseries):
        logging.debug(
            "Not a public phone, must already have filtered data, early return"
        )
        epq.mark_accuracy_filtering_done(user_id, None)
        return
    try:
        unfiltered_points_df = timeseries.get_data_df("background/location",
                                                      time_query)
        if len(unfiltered_points_df) == 0:
            # Nothing in this range; mark done with no "last processed" marker
            epq.mark_accuracy_filtering_done(user_id, None)
        else:
            filtered_from_unfiltered_df = unfiltered_points_df[
                unfiltered_points_df.accuracy < 200]
            logging.info(
                "filtered %d of %d points" %
                (len(filtered_from_unfiltered_df), len(unfiltered_points_df)))
            for idx, entry in filtered_from_unfiltered_df.iterrows():
                # First, we check to see if this is a duplicate of an existing entry.
                # If so, we will skip it since it is probably generated as a duplicate...
                if check_prior_duplicate(filtered_from_unfiltered_df, idx, entry):
                    logging.info(
                        "Found duplicate entry at index %s, id = %s, lat = %s, lng = %s, skipping" %
                        (idx, entry._id, entry.latitude, entry.longitude))
                    continue
                # Next, we check to see if there is an existing "background/filtered_location" point that corresponds
                # to this point. If there is, then we don't want to re-insert.
                # This ensures that this step is idempotent
                if check_existing_filtered_location(timeseries, entry):
                    logging.info(
                        "Found existing filtered location for entry at index = %s, id = %s, ts = %s, fmt_time = %s, skipping" %
                        (idx, entry._id, entry.ts, entry.fmt_time))
                    continue
                # logging.debug("Inserting %s filtered entry %s into timeseries" % (idx, entry))
                entry_copy = convert_to_filtered(
                    timeseries.get_entry_at_ts("background/location",
                                               "metadata.write_ts",
                                               entry.metadata_write_ts))
                timeseries.insert(entry_copy)
            last_entry_processed = unfiltered_points_df.iloc[
                -1].metadata_write_ts
            epq.mark_accuracy_filtering_done(user_id, last_entry_processed)
    except Exception:
        # Was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt; Exception is broad enough for a pipeline boundary
        logging.exception("Marking accuracy filtering as failed")
        epq.mark_accuracy_filtering_failed(user_id)
def testExistingFilteredLocation(self):
    """Before insertion no filtered counterpart is found; after inserting
    the converted entry, the lookup succeeds."""
    time_range = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    loaded_df = self.ts.get_data_df("background/location", time_range)
    self.assertEqual(len(loaded_df), 205)
    sample = loaded_df.iloc[5]
    # Nothing has been filtered yet, so the lookup must come up empty
    found_before = eaicf.check_existing_filtered_location(self.ts, sample)
    self.assertEqual(found_before, False)
    # Insert the filtered version of this exact point
    original = self.ts.get_entry_at_ts("background/location",
                                       "metadata.write_ts",
                                       sample.metadata_write_ts)
    self.ts.insert(eaicf.convert_to_filtered(original))
    # The same lookup should now succeed
    found_after = eaicf.check_existing_filtered_location(self.ts, sample)
    self.assertEqual(found_after, True)
def testCheckPriorDuplicate(self):
    """check_prior_duplicate is True only for a point that also occurs
    earlier in the dataframe, and False at the first occurrence."""
    time_query = epq.get_time_range_for_accuracy_filtering(self.testUUID)
    unfiltered_points_df = self.ts.get_data_df("background/location", time_query)
    self.assertEqual(len(unfiltered_points_df), 205)
    entry = unfiltered_points_df.iloc[5]
    # Prepend 5 copies of the entry. DataFrame.append was removed in
    # pandas 2.0, so build the combined frame with pd.concat instead.
    unfiltered_appended_df = pd.concat(
        [pd.DataFrame([entry] * 5), unfiltered_points_df]).reset_index()
    logging.debug("unfiltered_appended_df = %s" %
                  unfiltered_appended_df[["fmt_time"]].head())
    # At index 0 there is nothing earlier, so no prior duplicate...
    self.assertEqual(
        eaicf.check_prior_duplicate(unfiltered_appended_df, 0, entry), False)
    # ...at index 5 the prepended copies are prior duplicates...
    self.assertEqual(
        eaicf.check_prior_duplicate(unfiltered_appended_df, 5, entry), True)
    # ...and in the original frame index 5 is the first occurrence
    self.assertEqual(
        eaicf.check_prior_duplicate(unfiltered_points_df, 5, entry), False)