def testLocalRangeRolloverQuery(self):
    """
    Search for all entries between 8:18 and 9:08 local time, both inclusive.

    Because a local date query is a set of per-field filters rather than a
    contiguous range, any entry whose hour is in [8, 9] and whose minute is
    in the rollover set matches - so 8:18 is valid, but so is 9:57.
    """
    range_start = ecwl.LocalDate({'year': 2015, 'month': 8,
                                  'hour': 8, 'minute': 18})
    range_end = ecwl.LocalDate({'year': 2015, 'month': 8,
                                'hour': 9, 'minute': 8})

    query = {"user_id": self.testUUID}
    query.update(esdl.get_range_query("data.local_dt", range_start, range_end))

    cursor = edb.get_timeseries_db().find(query).sort('data.ts',
                                                      pymongo.ASCENDING)
    self.assertEqual(448, edb.get_timeseries_db().count_documents(query))

    matched = [ecwe.Entry(doc) for doc in cursor]
    # First match is at 8:18; last is at 9:57 (see filter note above)
    self.assertEqual(matched[0].data.local_dt.hour, 8)
    self.assertEqual(matched[0].data.local_dt.minute, 18)
    self.assertEqual(matched[-1].data.local_dt.hour, 9)
    self.assertEqual(matched[-1].data.local_dt.minute, 57)
def testJul22SplitAroundReboot(self):
    """Processing 2016-07-22 and 2016-07-25 in two separate pipeline
    batches should produce the same geojson as each day's ground truth."""
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    # Use context managers so the data files do not leak open handles
    with open(dataFile_1 + ".ground_truth") as gt_1:
        ground_truth_1 = json.load(gt_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_2:
        ground_truth_2 = json.load(gt_2, object_hook=bju.object_hook)

    # Run the pipeline once per day's data
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as df_2:
        self.entries = json.load(df_2, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Although we process the day's data in two batches, we should get the
    # same result
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)
def testZeroDurationPlaceInterpolationMultiSync(self):
    """Test for 545114feb5ac15caac4110d39935612525954b71: zero-duration
    place interpolation when the data arrives over multiple syncs."""
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-01-12"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-01-13"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 12})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 13})
    # Use context managers so the data files do not leak open handles
    with open(dataFile_1 + ".ground_truth") as gt_1:
        ground_truth_1 = json.load(gt_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_2:
        ground_truth_2 = json.load(gt_2, object_hook=bju.object_hook)

    # Run the pipeline once per day's data
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as df_2:
        self.entries = json.load(df_2, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Although we process the day's data in two batches, we should get the
    # same result
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)
def testCountLocalDateMetrics(self):
    """Monthly 'count' metrics grouped by local date for Aug-Sep 2015,
    checked for one user and (loosely) for the aggregate."""
    met_result = metrics.summarize_by_local_date(self.testUUID,
                                                 ecwl.LocalDate({'year': 2015, 'month': 8}),
                                                 ecwl.LocalDate({'year': 2015, 'month': 9}),
                                                 'MONTHLY', ['count'], True)
    # In Python 3, dict.keys() is a view object that never compares equal
    # to a list; materialize and sort it before comparing
    self.assertEqual(sorted(met_result.keys()),
                     ['aggregate_metrics', 'user_metrics'])
    user_met_result = met_result['user_metrics'][0]
    agg_met_result = met_result['aggregate_metrics'][0]
    logging.debug(met_result)

    # local timezone means that we only have one entry
    self.assertEqual(len(user_met_result), 1)
    self.assertEqual(user_met_result[0].nUsers, 1)
    self.assertEqual(user_met_result[0].ON_FOOT, 6)
    self.assertEqual(user_met_result[0].BICYCLING, 4)
    self.assertEqual(user_met_result[0].IN_VEHICLE, 5)

    # We are not going to make assertions about the aggregate values since
    # they are affected by other entries in the database but we expect them
    # to be at least as much as the user values
    self.assertEqual(len(agg_met_result), 1)
    self.assertEqual(agg_met_result[0].nUsers, 2)
    self.assertGreaterEqual(agg_met_result[0].BICYCLING,
                            user_met_result[0].BICYCLING + 1) # 21s has one bike trip
    self.assertGreaterEqual(agg_met_result[0].ON_FOOT,
                            user_met_result[0].ON_FOOT + 3) # 21s has three on-foot trips
    self.assertGreaterEqual(agg_met_result[0].IN_VEHICLE,
                            user_met_result[0].IN_VEHICLE + 3) # 21s has three motorized trips
def testResetToPast(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to a date before both
    - Verify that analysis data for both days is removed
    - Re-run pipelines
    - Verify that all is well
    """
    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    # Use context managers so the data files do not leak open handles
    with open(dataFile_1 + ".ground_truth") as gt_1:
        ground_truth_1 = json.load(gt_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_2:
        ground_truth_2 = json.load(gt_2, object_hook=bju.object_hook)

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as df_2:
        self.entries = json.load(df_2, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Verify that all is well
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)

    # Reset to a date well before the two days
    reset_ts = arrow.get("2015-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Analysis data should be completely deleted
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.assertEqual(api_result, [])
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.assertEqual(api_result, [])

    # Re-running the pipeline should reconstruct everything
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)
def testResetToStart(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to start
    - Verify that there is no analysis data
    - Re-run pipelines
    - Verify that all is well
    """
    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    # Use context managers so the data files do not leak open handles
    with open(dataFile_1 + ".ground_truth") as gt_1:
        ground_truth_1 = json.load(gt_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_2:
        ground_truth_2 = json.load(gt_2, object_hook=bju.object_hook)

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as df_2:
        self.entries = json.load(df_2, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)

    # Reset pipeline to start
    epr.reset_user_to_start(self.testUUID, is_dry_run=False)

    # Now there are no results
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.assertEqual(api_result, [])
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.assertEqual(api_result, [])

    # Re-running the pipeline should get us back to ground truth
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)
def setUp(self):
    """Load the 2015-aug-27 real example, normalize the accuracy filtering,
    and record the day's timestamp / local-date boundaries for the tests."""
    etc.setupRealExample(self,
                         "emission/tests/data/real_examples/shankari_2015-aug-27")
    eaicf.filter_accuracy(self.testUUID)
    estfm.move_all_filters_to_data()
    # Collection.count() was removed in pymongo 4; use
    # estimated_document_count() as the other tests in this file do
    logging.info("After loading, timeseries db size = %s" %
                 edb.get_timeseries_db().estimated_document_count())
    # Timestamp bounds for 2015-08-27 (presumably midnight-to-midnight in
    # the example's local timezone - TODO confirm against the dataset)
    self.day_start_ts = 1440658800
    self.day_end_ts = 1440745200
    self.day_start_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
    self.day_end_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
def testAug10MultiSyncEndNotDetected(self):
    """Re-run 2016-08-10 with multiple calls to sync data.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data: the first sync stops a few
    points before the trip end, so the trip-end detection has to recover on
    the second sync.
    """
    dataFile = "emission/tests/data/real_examples/shankari_2016-08-10"
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 9})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
    cacheKey = "diary/trips-2016-08-10"
    # Combined ground truth for the 9th and the 10th
    with open(
        "emission/tests/data/real_examples/shankari_2016-08-910.ground_truth"
    ) as gtf:
        ground_truth = json.load(gtf, object_hook=bju.object_hook)
    logging.info("Before loading, timeseries db size = %s" %
                 edb.get_timeseries_db().estimated_document_count())
    with open(dataFile) as df:
        all_entries = json.load(df, object_hook=bju.object_hook)
    # Split point: 10:30 local time on the 10th
    ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").timestamp
    logging.debug("ts_1030 = %s, converted back = %s" %
                  (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
    before_1030_entries = [
        e for e in all_entries
        if ad.AttrDict(e).metadata.write_ts <= ts_1030
    ]
    after_1030_entries = [
        e for e in all_entries
        if ad.AttrDict(e).metadata.write_ts > ts_1030
    ]
    # First load all data from the 9th. Otherwise, the missed trip is the
    # first trip, and we don't set the last_ts_processed
    # See the code around "logging.debug("len(segmentation_points) == 0, early return")"
    etc.setupRealExample(
        self, "emission/tests/data/real_examples/shankari_2016-08-09")
    # Sync at 10:30 to capture all the points on the trip *to* the optometrist
    # Skip the last few points to ensure that the trip end is skipped
    self.entries = before_1030_entries[0:-2]
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
    # Then sync after 10:30
    self.entries = after_1030_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
    self.persistGroundTruthIfNeeded(api_result, dataFile, start_ld, cacheKey)
    # Although we process the day's data in two batches, we should get the
    # same result (within fuzz tolerances)
    self.compare_approx_result(ad.AttrDict({
        'result': api_result
    }).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60, distance_fuzz=100)
def testIsMatchedUser(self):
    """A query spec that requires both walking trips and at least one air
    trip should match only the Hawaii dataset, not the Bay Area one."""
    # Load data for the Bay Area
    etc.setupRealExample(self,
                         "emission/tests/data/real_examples/shankari_2016-06-20")
    bay_area_uuid = self.testUUID
    etc.runIntakePipeline(bay_area_uuid)
    logging.debug("uuid for the bay area = %s " % bay_area_uuid)

    # Load data for Hawaii
    etc.setupRealExample(self,
                         "emission/tests/data/real_examples/shankari_2016-07-27")
    hawaii_uuid = self.testUUID
    etc.runIntakePipeline(hawaii_uuid)
    logging.debug("uuid for hawaii = %s " % hawaii_uuid)

    self.testUUIDList = [bay_area_uuid, hawaii_uuid]

    air_query_spec = {
        "time_type": "local_date",
        "from_local_date": {"year": 2016, "month": 2},
        "to_local_date": {"year": 2016, "month": 9},
        "freq": 'DAILY',
        "checks": [
            {"modes": ['WALKING', 'ON_FOOT'],
             "metric": "count",
             "threshold": {"$gt": 5}},
            {"modes": ['AIR_OR_HSR'],
             "metric": "count",
             "threshold": {"$gt": 1}},
        ]
    }
    # Since this requires at least one air trip, this will only return the
    # hawaii trip
    self.assertTrue(tripmetrics.is_matched_user(hawaii_uuid, air_query_spec))
    self.assertFalse(tripmetrics.is_matched_user(bay_area_uuid, air_query_spec))
def testLocalRangeStandardQuery(self):
    """
    Search for all entries between 8:18 and 8:20 local time, both inclusive
    """
    start_local_dt = ecwl.LocalDate({'year': 2015, 'month': 8,
                                     'hour': 8, 'minute': 18})
    end_local_dt = ecwl.LocalDate({'year': 2015, 'month': 8,
                                   'hour': 8, 'minute': 20})
    final_query = {"user_id": self.testUUID}
    final_query.update(esdl.get_range_query("data.local_dt",
                                            start_local_dt, end_local_dt))
    # Cursor.count() was removed in pymongo 4 and assertEquals is a
    # deprecated alias; use count_documents()/assertEqual as the rollover
    # test does
    self.assertEqual(15,
                     edb.get_timeseries_db().count_documents(final_query))
def get_time_query(year, month):
    """Build a TimeComponentQuery over data.start_local_dt for the given
    year (and optionally month).

    Returns None when neither year nor month is supplied. A month without a
    year is rejected via assertion.
    """
    if year is None and month is None:
        return None

    assert year is not None
    if month is None:
        query_ld = ecwl.LocalDate({"year": year})
    else:
        query_ld = ecwl.LocalDate({"year": year, "month": month})
    return esttc.TimeComponentQuery("data.start_local_dt", query_ld, query_ld)
def testOct07MultiSyncSpuriousEndDetected(self):
    """Re-run issue #436 data with multiple calls to sync data.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data.
    """
    dataFile = "emission/tests/data/real_examples/issue_436_assertion_error"
    # Plain decimal 7 - the original octal literal 0o7 also evaluates to 7
    # but is needlessly confusing for a day-of-month
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
    cacheKey = "diary/trips-2016-10-07"
    with open(dataFile + ".ground_truth") as gtf:
        ground_truth = json.load(gtf, object_hook=bju.object_hook)

    logging.info("Before loading, timeseries db size = %s" %
                 edb.get_timeseries_db().estimated_document_count())
    with open(dataFile) as df:
        all_entries = json.load(df, object_hook=bju.object_hook)
    # Split the day's entries at 18:33 local time so the first batch stops
    # mid-trip
    ts_1800 = arrow.get("2016-10-07T18:33:11-07:00").timestamp
    logging.debug("ts_1800 = %s, converted back = %s" %
                  (ts_1800, arrow.get(ts_1800).to("America/Los_Angeles")))
    before_1800_entries = [e for e in all_entries
                           if ad.AttrDict(e).metadata.write_ts <= ts_1800]
    after_1800_entries = [e for e in all_entries
                          if ad.AttrDict(e).metadata.write_ts > ts_1800]

    # Sync the first batch and run the pipeline on the partial data
    etc.createAndFillUUID(self)
    self.entries = before_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Then sync the remainder and re-run
    self.entries = after_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
    self.persistGroundTruthIfNeeded(api_result, dataFile, start_ld, cacheKey)

    # Although we process the day's data in two batches, we should get the
    # same result (within fuzz tolerances)
    self.compare_approx_result(ad.AttrDict({'result': api_result}).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60, distance_fuzz=100)
def testGroupedByOneLocalDayMultiUTCDay(self):
    """Three sections on one local (PST) day, the last of which (23:00 PDT)
    falls on the *next* day in UTC: local-date grouping yields one bin,
    timestamp grouping yields two.
    """
    # NOTE(review): `key` appears unused in this test
    key = (2016, 5, 3)
    test_section_list = []
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,6, tzinfo=tz.gettz(PST)),
                                PST))
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,10, tzinfo=tz.gettz(PST)),
                                PST))
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)),
                                PST))
    self._fillModeDistanceDuration(test_section_list)
    logging.debug("durations = %s" % [s.data.duration for s in test_section_list])

    # There's only one local date, so it will be consistent with
    # results in testGroupedByOneLocalDayOneUTCDay
    summary_ld_dict = earmt.group_by_local_date(self.testUUID,
                                                ecwl.LocalDate({'year': 2016, 'month': 5}),
                                                ecwl.LocalDate({'year': 2016, 'month': 6}),
                                                earmt.LocalFreq.DAILY,
                                                [earmts.get_count])
    summary_ld = summary_ld_dict["result"][0]
    summary_ld_last = summary_ld_dict["last_ts_processed"]
    self.assertEqual(summary_ld_last,
                     arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)).timestamp)
    self.assertEqual(len(summary_ld), 1) # spans one day
    self.assertEqual(summary_ld[0].BICYCLING, 3)
    self.assertEqual(summary_ld[0].ts, 1462258800)
    self.assertEqual(summary_ld[0].local_dt.day, 3)

    summary_ts_dict = earmt.group_by_timestamp(self.testUUID,
                                               arrow.Arrow(2016,5,1).timestamp,
                                               arrow.Arrow(2016,6,1).timestamp,
                                               'd', [earmts.get_count])
    summary_ts = summary_ts_dict["result"][0]
    summary_ts_last = summary_ts_dict["last_ts_processed"]
    # But 23:00 PDT is 6am on the 4th in UTC,
    # so the results are different for this
    self.assertEqual(summary_ts_last,
                     arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)).timestamp)
    self.assertEqual(len(summary_ts), 2) # spans two days in UTC
    self.assertEqual(summary_ts[0].BICYCLING, 2) # 2 trips on the first day
    self.assertEqual(summary_ts[1].BICYCLING, 1) # 1 trip on the second day
    self.assertEqual(summary_ts[0].local_dt.day, 3) # first UTC day
    self.assertEqual(summary_ts[1].local_dt.day, 4) # second UTC day
    self.assertEqual(summary_ts[0].ts, 1462233600) # timestamp for midnight 3rd May
    self.assertEqual(summary_ts[1].ts, 1462320000) # timestamp for midnight 4th May
def testGroupedByOneLocalDayOneUTCDay(self):
    """Three sections that fall within one local (PST) day AND one UTC day,
    so the local-date grouping and the timestamp grouping should produce
    equivalent bins (counts match; bin start timestamps differ).
    """
    # NOTE(review): `key` appears unused in this test
    key = (2016, 5, 3)
    test_section_list = []
    #
    # Since PST is UTC-7, all of these will be in the same UTC day
    # 13:00, 17:00, 21:00
    # so we expect the local date and UTC bins to be the same
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,6, tzinfo=tz.gettz(PST)),
                                PST))
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,10, tzinfo=tz.gettz(PST)),
                                PST))
    test_section_list.append(
        self._createTestSection(arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)),
                                PST))
    self._fillModeDistanceDuration(test_section_list)
    logging.debug("durations = %s" % [s.data.duration for s in test_section_list])

    summary_ts_dict = earmt.group_by_timestamp(self.testUUID,
                                               arrow.Arrow(2016,5,1).timestamp,
                                               arrow.Arrow(2016,6,1).timestamp,
                                               'd', [earmts.get_count])
    summary_ld_dict = earmt.group_by_local_date(self.testUUID,
                                                ecwl.LocalDate({'year': 2016, 'month': 5}),
                                                ecwl.LocalDate({'year': 2016, 'month': 6}),
                                                earmt.LocalFreq.DAILY,
                                                [earmts.get_count])
    summary_ts_last = summary_ts_dict["last_ts_processed"]
    summary_ld_last = summary_ld_dict["last_ts_processed"]
    summary_ts = summary_ts_dict["result"][0]
    summary_ld = summary_ld_dict["result"][0]

    # Both groupings should have processed through the last (14:00) section
    self.assertEqual(summary_ts_last,
                     arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)).timestamp)
    self.assertEqual(summary_ld_last,
                     arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)).timestamp)

    self.assertEqual(len(summary_ts), len(summary_ld)) # local date and UTC results are the same
    self.assertEqual(len(summary_ts), 1) # spans one day
    self.assertEqual(summary_ts[0].BICYCLING, summary_ld[0].BICYCLING)
    self.assertEqual(summary_ts[0].BICYCLING, 3)
    # Note that the timestamps are not guaranteed to be equal since
    # the UTC range starts at midnight UTC while the local time range
    # starts at midnight PDT
    # self.assertEqual(summary_ts[0].ts, summary_ld[0].ts)
    self.assertEqual(summary_ts[0].ts, 1462233600)
    self.assertEqual(summary_ld[0].ts, 1462258800)
    self.assertEqual(summary_ts[0].local_dt.day, 3)
    self.assertEqual(summary_ts[0].local_dt.day, summary_ld[0].local_dt.day)
def testFeb22MultiSyncEndNotDetected(self):
    """Re-run 2016-02-22 (iphone) with multiple calls to sync data.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data.
    """
    dataFile = "emission/tests/data/real_examples/iphone_2016-02-22"
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    # Use context managers so the data files do not leak open handles
    with open(dataFile + ".ground_truth") as gtf:
        ground_truth = json.load(gtf, object_hook=bju.object_hook)
    # count() was removed in pymongo 4; use estimated_document_count() as
    # the other tests in this file do
    logging.info("Before loading, timeseries db size = %s" %
                 edb.get_timeseries_db().estimated_document_count())
    with open(dataFile) as df:
        all_entries = json.load(df, object_hook=bju.object_hook)
    # 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00,
    # so right after 18:00
    ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").timestamp
    logging.debug("ts_1800 = %s, converted back = %s" %
                  (ts_1800, arrow.get(ts_1800).to("America/Los_Angeles")))
    before_1800_entries = [e for e in all_entries
                           if ad.AttrDict(e).metadata.write_ts <= ts_1800]
    after_1800_entries = [e for e in all_entries
                          if ad.AttrDict(e).metadata.write_ts > ts_1800]

    # Sync at 18:00 to capture all the points on the trip *to* the
    # optometrist. Skip the last few points to ensure that the trip end is
    # skipped
    import uuid
    self.testUUID = uuid.uuid4()
    self.entries = before_1800_entries[0:-2]
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Then sync after 18:00
    self.entries = after_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Although we process the day's data in two batches, we should get the
    # same result (within fuzz tolerances)
    self.compare_approx_result(ad.AttrDict({'result': api_result}).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60, distance_fuzz=100)
def testFeb22ShortTripsDistance(self):
    """2016-02-22 (iphone_3): short-trips distance regression test against
    the stored ground truth."""
    dataFile = "emission/tests/data/real_examples/iphone_3_2016-02-22"
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    # Use a context manager so the ground truth file does not leak an open
    # handle
    with open(dataFile + ".ground_truth") as gtf:
        ground_truth = json.load(gtf, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def testResetToFuture(self):
    """
    - Load data for both days
    - Run pipelines
    - Reset to a date after the two
    - Verify that all is well (the reset is a no-op via early return)
    - Re-run pipelines and ensure that there are no errors
    """
    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    # Use context managers so the data files do not leak open handles
    with open(dataFile_1 + ".ground_truth") as gt_1:
        ground_truth_1 = json.load(gt_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_2:
        ground_truth_2 = json.load(gt_2, object_hook=bju.object_hook)

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as df_2:
        self.entries = json.load(df_2, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Reset to a date well after the two days
    reset_ts = arrow.get("2017-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Data should be untouched because of early return
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_1).data)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth_2).data)

    # Re-running the pipeline again should not affect anything
    etc.runIntakePipeline(self.testUUID)
def testAug27TooMuchExtrapolation(self):
    """2015-aug-27: regression test for over-aggressive extrapolation,
    compared against the stored ground truth."""
    dataFile = "emission/tests/data/real_examples/shankari_2015-aug-27"
    start_ld = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
    end_ld = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
    # Use a context manager so the ground truth file does not leak an open
    # handle
    with open(dataFile + ".ground_truth") as gtf:
        ground_truth = json.load(gtf, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def testLocalMatchingQuery(self):
    """
    Search for all entries that occur at minute = 8 from any hour
    """
    start_local_dt = ecwl.LocalDate({'minute': 8})
    end_local_dt = ecwl.LocalDate({'minute': 8})
    final_query = {"user_id": self.testUUID}
    final_query.update(esdl.get_range_query("data.local_dt",
                                            start_local_dt, end_local_dt))
    entries_docs = edb.get_timeseries_db().find(final_query).sort(
        "metadata.write_ts")
    # Cursor.count() was removed in pymongo 4 and assertEquals is a
    # deprecated alias; use count_documents()/assertEqual instead
    self.assertEqual(20,
                     edb.get_timeseries_db().count_documents(final_query))
    entries = [ecwe.Entry(doc) for doc in entries_docs]
    logging.debug("entries bookends are %s and %s" % (entries[0], entries[-1]))
    first_entry = entries[0]
    self.assertEqual(first_entry.data.local_dt.hour, 9)
    last_entry = entries[19]
    self.assertEqual(last_entry.data.local_dt.hour, 17)
def get_trips_for_day(user_uuid, day, force_refresh):
    """Return the geojson timeline for `user_uuid` on `day`.

    `day` is a string such as "2015-10-01" or "2016-01-01"; it is parsed
    into a LocalDate covering exactly that day, which is then used to query
    the timeseries.

    We could also cache the timeline views in a separate collection and
    just look up from there. The challenge is deciding when to recompute a
    view - the standard technique used for the other pipeline stages does
    not work because the current day's timeline would have to be recomputed
    multiple times.
    """
    # I was going to read from the user cache if it existed there, and
    # recreate from scratch if it didn't. But that would involve adding a
    # getDocument field to the usercache, which was intentionally not added
    # before this. With a getDocument method the usercache would no longer
    # be a cache - it would be "storage" used internally. If we want that,
    # it should really be a materialized view rather than living only in
    # the usercache, which should cache values stored elsewhere.
    parsed = dup.parse(day)
    day_dt = ecwl.LocalDate({'year': parsed.year,
                             'month': parsed.month,
                             'day': parsed.day})
    return gfc.get_geojson_for_dt(user_uuid, day_dt, day_dt)
def testAug11(self):
    """2016-08-11: a more complex day.

    Covers PR #352 (should not split trip to Oakland), PR #348 (trip from
    station to OAK DOT), PR #357 (trip to Radio Shack is complete and not
    truncated), and PR #345 (no cleaned trips are skipped).
    """
    dataFile = "emission/tests/data/real_examples/shankari_2016-08-11"
    ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 11})
    with open(dataFile + ".ground_truth") as gfp:
        ground_truth = json.load(gfp, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    # runIntakePipeline does not run the common trips, habitica or
    # store-views-to-cache stages, so compare the API result directly
    # against the ground truth instead of going through the cache
    api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def testJun20(self):
    """2016-06-20: a fairly straightforward day.

    Tests mainly the ordering of trips and the handling of repeated
    location entries with different write timestamps - there are two
    identical location points with ts = 1466436483.395 and
    write_ts = 1466436496.4 / 1466436497.047.
    """
    dataFile = "emission/tests/data/real_examples/shankari_2016-06-20"
    ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
    with open(dataFile + ".ground_truth") as gfp:
        ground_truth = json.load(gfp, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    # runIntakePipeline does not run the common trips, habitica or
    # store-views-to-cache stages, so compare the API result directly
    # against the ground truth instead of going through the cache
    api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def testJun21(self):
    """2016-06-21: a more complex day.

    Covers PR #357 (spurious trip at 14:00 should be segmented and
    skipped) and PR #358 (trip back from bella's house at 16:00).
    """
    dataFile = "emission/tests/data/real_examples/shankari_2016-06-21"
    ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 21})
    with open(dataFile + ".ground_truth") as gfp:
        ground_truth = json.load(gfp, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    # runIntakePipeline does not run the common trips, habitica or
    # store-views-to-cache stages, so compare the API result directly
    # against the ground truth instead of going through the cache
    api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def testAug10(self):
    """2016-08-10: a more complex day.

    Covers PR #302 (trip to optometrist) and PR #352 (split optometrist
    trip).
    """
    dataFile = "emission/tests/data/real_examples/shankari_2016-08-10"
    ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
    # Use a context manager (as testJun20/testJun21 do) so the ground truth
    # file handle is not leaked
    with open(dataFile + ".ground_truth") as gfp:
        ground_truth = json.load(gfp, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    etc.runIntakePipeline(self.testUUID)
    # runIntakePipeline does not run the common trips, habitica or
    # store-views-to-cache stages, so compare the API result directly
    # against the ground truth instead of going through the cache
    api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)
    self.compare_result(ad.AttrDict({'result': api_result}).result,
                        ad.AttrDict(ground_truth).data)
def local_dt_fill_times_yearly(key, section_group_df, metric_summary):
    """Fill ts / local_dt / fmt_time on `metric_summary` for a yearly
    bucket, where `key` is the year and the timezone is taken from the
    first entry of the section group."""
    zone = _get_tz(section_group_df)
    year_start = arrow.Arrow(key, 1, 1, tzinfo=zone).floor('year')
    metric_summary.ts = year_start.timestamp
    metric_summary.local_dt = ecwl.LocalDate({'year': key, 'timezone': zone})
    metric_summary.fmt_time = year_start.format("YYYY")
def testJun20Postload(self):
    """Same as testJun20Preload, except that the user input arrives after
    the pipeline has already run once; the matching then happens on the
    next pipeline run."""
    day = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
    self.checkConfirmedTripsAndSections(
        "emission/tests/data/real_examples/shankari_2016-06-20",
        day, preload=False)
def testLocalDate(self):
    """A LocalDate assigned to a wrapper attribute keeps the fields that
    were set and raises AttributeError for fields that were not."""
    import emission.core.wrapper.localdate as ecwl
    test_local = TestWrapper({'a': 1, 'c': 3})
    test_local.write_local_dt = ecwl.LocalDate({'year': 2016, 'month': 4})
    self.assertEqual(test_local.write_local_dt.year, 2016)
    self.assertEqual(test_local.write_local_dt.month, 4)
    # assertRaisesRegexp is a deprecated alias that was removed in Python
    # 3.12; use assertRaisesRegex
    with self.assertRaisesRegex(AttributeError, ".*has no attribute.*"):
        print("the value of day is %s" % test_local.write_local_dt.day)
def testAug10(self):
    """2016-08-10: a more complex day.

    Covers PR #302 (trip to optometrist) and PR #352 (split optometrist
    trip).
    """
    day = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
    self.standardMatchDataGroundTruth(
        "emission/tests/data/real_examples/shankari_2016-08-10",
        day,
        "diary/trips-2016-08-10")
def testJun21(self):
    """2016-06-21: a more complex day.

    Covers PR #357 (spurious trip at 14:00 should be segmented and
    skipped) and PR #358 (trip back from bella's house at 16:00).
    """
    day = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 21})
    self.standardMatchDataGroundTruth(
        "emission/tests/data/real_examples/shankari_2016-06-21",
        day,
        "diary/trips-2016-06-21")
def testJun20(self):
    """2016-06-20: a fairly straightforward day.

    Tests mainly the ordering of trips and the handling of repeated
    location entries with different write timestamps - there are two
    identical location points with ts = 1466436483.395 and
    write_ts = 1466436496.4 / 1466436497.047.
    """
    day = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
    self.standardMatchDataGroundTruth(
        "emission/tests/data/real_examples/shankari_2016-06-20",
        day,
        "diary/trips-2016-06-20")