def testLocalRangeRolloverQuery(self):
        """
        Search for all entries between 8:18 and 9:08 local time, both inclusive.

        The range deliberately rolls over an hour boundary (hour 8 -> 9), so
        this exercises get_range_query's rollover handling. NOTE(review): each
        local_dt component is filtered independently rather than as one
        contiguous time range -- see the assertions at the end, where 9:57 is
        also matched.
        """
        start_local_dt = ecwl.LocalDate({
            'year': 2015,
            'month': 8,
            'hour': 8,
            'minute': 18
        })
        end_local_dt = ecwl.LocalDate({
            'year': 2015,
            'month': 8,
            'hour': 9,
            'minute': 8
        })
        final_query = {"user_id": self.testUUID}
        final_query.update(
            esdl.get_range_query("data.local_dt", start_local_dt,
                                 end_local_dt))
        entries = edb.get_timeseries_db().find(final_query).sort(
            'data.ts', pymongo.ASCENDING)
        self.assertEqual(448,
                         edb.get_timeseries_db().count_documents(final_query))

        entries_list = list(entries)

        # Note that since this is a set of per-component filters, as opposed to
        # a contiguous range, this returns all entries between minute 18 and
        # minute 8 in both hours -- so 8:18 is valid, but so is 9:57
        self.assertEqual(ecwe.Entry(entries_list[0]).data.local_dt.hour, 8)
        self.assertEqual(ecwe.Entry(entries_list[0]).data.local_dt.minute, 18)
        self.assertEqual(ecwe.Entry(entries_list[-1]).data.local_dt.hour, 9)
        self.assertEqual(ecwe.Entry(entries_list[-1]).data.local_dt.minute, 57)
    def testJul22SplitAroundReboot(self):
        """
        Load 2016-07-22 and 2016-07-25 in two separate intake runs (simulating
        a split around a reboot) and verify that the geojson for each day still
        matches its single-run ground truth.
        """
        dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
        dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
        start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
        start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
        cacheKey_1 = "diary/trips-2016-07-22"
        cacheKey_2 = "diary/trips-2016-07-25"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile_1 + ".ground_truth") as gt_fp:
            ground_truth_1 = json.load(gt_fp, object_hook=bju.object_hook)
        with open(dataFile_2 + ".ground_truth") as gt_fp:
            ground_truth_2 = json.load(gt_fp, object_hook=bju.object_hook)

        # First batch: day 1 through the full pipeline
        etc.setupRealExample(self, dataFile_1)
        etc.runIntakePipeline(self.testUUID)
        # Second batch: day 2 entries for the same user
        with open(dataFile_2) as entry_fp:
            self.entries = json.load(entry_fp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        # Although we process the day's data in two batches, we should get the same result
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        # Although we process the day's data in two batches, we should get the same result
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)
    def testZeroDurationPlaceInterpolationMultiSync(self):
        """
        Regression test for 545114feb5ac15caac4110d39935612525954b71
        (zero-duration place interpolation when data arrives in multiple
        syncs). Processes 2016-01-12 and 2016-01-13 in two intake runs and
        checks both days against their ground truth.
        """
        dataFile_1 = "emission/tests/data/real_examples/shankari_2016-01-12"
        dataFile_2 = "emission/tests/data/real_examples/shankari_2016-01-13"
        start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 12})
        start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 13})
        cacheKey_1 = "diary/trips-2016-01-12"
        cacheKey_2 = "diary/trips-2016-01-13"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile_1 + ".ground_truth") as gt_fp:
            ground_truth_1 = json.load(gt_fp, object_hook=bju.object_hook)
        with open(dataFile_2 + ".ground_truth") as gt_fp:
            ground_truth_2 = json.load(gt_fp, object_hook=bju.object_hook)

        # First batch: day 1 through the full pipeline
        etc.setupRealExample(self, dataFile_1)
        etc.runIntakePipeline(self.testUUID)
        # Second batch: day 2 entries for the same user
        with open(dataFile_2) as entry_fp:
            self.entries = json.load(entry_fp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        # Although we process the day's data in two batches, we should get the same result
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        # Although we process the day's data in two batches, we should get the same result
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)
# ---- Esempio n. 4 (snippet separator from the example aggregator) ----
    def testCountLocalDateMetrics(self):
        """
        Summarize trip counts by local date over Aug-Sep 2015 and check both
        the per-user and the aggregate results.
        """
        met_result = metrics.summarize_by_local_date(self.testUUID,
                                                     ecwl.LocalDate({'year': 2015, 'month': 8}),
                                                     ecwl.LocalDate({'year': 2015, 'month': 9}),
                                                     'MONTHLY', ['count'], True)
        # On Python 3, dict.keys() is a view and never compares equal to a
        # list, so compare sorted key lists instead
        self.assertEqual(sorted(met_result.keys()),
                         ['aggregate_metrics', 'user_metrics'])
        user_met_result = met_result['user_metrics'][0]
        agg_met_result = met_result['aggregate_metrics'][0]

        logging.debug(met_result)

        # local timezone means that we only have one entry
        self.assertEqual(len(user_met_result), 1)
        self.assertEqual(user_met_result[0].nUsers, 1)
        self.assertEqual(user_met_result[0].ON_FOOT, 6)
        self.assertEqual(user_met_result[0].BICYCLING, 4)
        self.assertEqual(user_met_result[0].IN_VEHICLE, 5)
        # We are not going to make exact assertions about the aggregate values
        # since they are affected by other entries in the database, but we
        # expect them to be at least as much as the user values
        self.assertEqual(len(agg_met_result), 1)
        self.assertEqual(agg_met_result[0].nUsers, 2)
        self.assertGreaterEqual(agg_met_result[0].BICYCLING,
                                user_met_result[0].BICYCLING + 1) # 21s has one bike trip
        self.assertGreaterEqual(agg_met_result[0].ON_FOOT,
                                user_met_result[0].ON_FOOT + 3) # 21s has three walk trips
        self.assertGreaterEqual(agg_met_result[0].IN_VEHICLE,
                                user_met_result[0].IN_VEHICLE + 3) # 21s has three motorized trips
    def testResetToPast(self):
        """
        - Load data for both days
        - Run pipelines
        - Verify that all is well
        - Reset to a date before both
        - Verify that analysis data for both days is removed
        - Re-run pipelines
        - Verify that all is well
        """
        # Load all data
        dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
        dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
        start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
        start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
        cacheKey_1 = "diary/trips-2016-07-22"
        cacheKey_2 = "diary/trips-2016-07-25"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile_1 + ".ground_truth") as gt_fp:
            ground_truth_1 = json.load(gt_fp, object_hook=bju.object_hook)
        with open(dataFile_2 + ".ground_truth") as gt_fp:
            ground_truth_2 = json.load(gt_fp, object_hook=bju.object_hook)

        # Run both pipelines
        etc.setupRealExample(self, dataFile_1)
        etc.runIntakePipeline(self.testUUID)
        with open(dataFile_2) as entry_fp:
            self.entries = json.load(entry_fp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)

        # Verify that all is well
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)

        # Reset to a date well before the two days
        reset_ts = arrow.get("2015-07-24").timestamp
        epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

        # Analysis data should be completely deleted
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.assertEqual(api_result, [])

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.assertEqual(api_result, [])

        # Re-running the pipeline again
        etc.runIntakePipeline(self.testUUID)

        # Should reconstruct everything
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)
    def testResetToStart(self):
        """
        - Load data for both days
        - Run pipelines
        - Verify that all is well
        - Reset to start
        - Verify that there is no analysis data
        - Re-run pipelines
        - Verify that all is well
        """

        # Load all data
        dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
        dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
        start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
        start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
        cacheKey_1 = "diary/trips-2016-07-22"
        cacheKey_2 = "diary/trips-2016-07-25"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile_1 + ".ground_truth") as gt_fp:
            ground_truth_1 = json.load(gt_fp, object_hook=bju.object_hook)
        with open(dataFile_2 + ".ground_truth") as gt_fp:
            ground_truth_2 = json.load(gt_fp, object_hook=bju.object_hook)

        # Run both pipelines
        etc.setupRealExample(self, dataFile_1)
        etc.runIntakePipeline(self.testUUID)
        with open(dataFile_2) as entry_fp:
            self.entries = json.load(entry_fp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)

        # Check results: so far, so good
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)

        # Reset pipeline to start
        epr.reset_user_to_start(self.testUUID, is_dry_run=False)

        # Now there are no results
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.assertEqual(api_result, [])

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.assertEqual(api_result, [])

        # Re-run the pipeline again
        etc.runIntakePipeline(self.testUUID)

        # Should be back to ground truth
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1, start_ld_1)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth_2).data)
# ---- Esempio n. 7 (snippet separator from the example aggregator) ----
 def setUp(self):
     # Load a known real-world day of data (2015-08-27) and run the
     # pre-processing steps the tests under this fixture depend on
     etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-aug-27")
     eaicf.filter_accuracy(self.testUUID)
     estfm.move_all_filters_to_data()
     logging.info("After loading, timeseries db size = %s" % edb.get_timeseries_db().count())
     # Start/end of 2015-08-27 as unix timestamps (1440658800 is
     # 2015-08-27T07:00Z, i.e. local midnight; presumably
     # America/Los_Angeles -- TODO confirm against the dataset)
     self.day_start_ts = 1440658800
     self.day_end_ts = 1440745200
     # The same day expressed as LocalDate components for local-date queries
     self.day_start_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
     self.day_end_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
    def testAug10MultiSyncEndNotDetected(self):
        # Re-run, but with multiple calls to sync data
        # This tests the effect of online versus offline analysis and segmentation with potentially partial data

        dataFile = "emission/tests/data/real_examples/shankari_2016-08-10"
        start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 9})
        end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
        cacheKey = "diary/trips-2016-08-10"
        # Ground truth covers both the 9th and the 10th (note "08-910" name)
        with open(
                "emission/tests/data/real_examples/shankari_2016-08-910.ground_truth"
        ) as gtf:
            ground_truth = json.load(gtf, object_hook=bju.object_hook)

        logging.info("Before loading, timeseries db size = %s" %
                     edb.get_timeseries_db().estimated_document_count())
        with open(dataFile) as df:
            all_entries = json.load(df, object_hook=bju.object_hook)
        # Split point for the simulated syncs: 10:30 local time on the 10th
        ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").timestamp
        logging.debug("ts_1030 = %s, converted back = %s" %
                      (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
        before_1030_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts <= ts_1030
        ]
        after_1030_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts > ts_1030
        ]

        # First load all data from the 9th. Otherwise, the missed trip is the first trip,
        # and we don't set the last_ts_processed
        # See the code around "logging.debug("len(segmentation_points) == 0, early return")"
        etc.setupRealExample(
            self, "emission/tests/data/real_examples/shankari_2016-08-09")

        # Sync at 10:30 to capture all the points on the trip *to* the optometrist
        # Skip the last few points to ensure that the trip end is skipped
        self.entries = before_1030_entries[0:-2]
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Then sync after 10:30
        self.entries = after_1030_entries
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
        self.persistGroundTruthIfNeeded(api_result, dataFile, start_ld,
                                        cacheKey)

        # Although we process the day's data in two batches, we should get the same result
        # (fuzzy comparison, since the split can shift trip boundaries slightly)
        self.compare_approx_result(ad.AttrDict({
            'result': api_result
        }).result,
                                   ad.AttrDict(ground_truth).data,
                                   time_fuzz=60,
                                   distance_fuzz=100)
    def testIsMatchedUser(self):
        """
        A query spec requiring both frequent walking and at least one air trip
        should match only the user whose data contains an air trip.
        """
        # Load data for the Bay Area
        dataFileba = "emission/tests/data/real_examples/shankari_2016-06-20"
        ldba = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})

        etc.setupRealExample(self, dataFileba)
        testUUIDba = self.testUUID
        etc.runIntakePipeline(testUUIDba)
        logging.debug("uuid for the bay area = %s " % testUUIDba)

        # Load data for Hawaii
        dataFilehi = "emission/tests/data/real_examples/shankari_2016-07-27"
        ldhi = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 27})

        etc.setupRealExample(self, dataFilehi)
        testUUIDhi = self.testUUID
        etc.runIntakePipeline(testUUIDhi)
        logging.debug("uuid for hawaii = %s " % testUUIDhi)

        self.testUUIDList = [testUUIDba, testUUIDhi]

        # Daily local-date query over Feb-Sep 2016 with two checks:
        # more than 5 walking trips AND more than 1 air trip in a day
        walk_check = {
            "modes": ['WALKING', 'ON_FOOT'],
            "metric": "count",
            "threshold": {"$gt": 5}
        }
        air_check = {
            "modes": ['AIR_OR_HSR'],
            "metric": "count",
            "threshold": {"$gt": 1}
        }
        air_query_spec = {
            "time_type": "local_date",
            "from_local_date": {"year": 2016, "month": 2},
            "to_local_date": {"year": 2016, "month": 9},
            "freq": 'DAILY',
            "checks": [walk_check, air_check]
        }

        # Since this requires at least one air trip, this will only return the
        # hawaii trip
        self.assertTrue(tripmetrics.is_matched_user(testUUIDhi,
                                                    air_query_spec))
        self.assertFalse(
            tripmetrics.is_matched_user(testUUIDba, air_query_spec))
 def testLocalRangeStandardQuery(self):
     """
     Search for all entries between 8:18 and 8:20 local time, both inclusive.
     """
     start_local_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'hour': 8, 'minute': 18})
     end_local_dt = ecwl.LocalDate({'year': 2015, 'month': 8, 'hour': 8, 'minute': 20})
     final_query = {"user_id": self.testUUID}
     final_query.update(esdl.get_range_query("data.local_dt", start_local_dt, end_local_dt))
     # Cursor.count() was removed in pymongo 4; use count_documents on the
     # collection instead (consistent with testLocalRangeRolloverQuery).
     # assertEquals is a deprecated alias of assertEqual.
     self.assertEqual(15, edb.get_timeseries_db().count_documents(final_query))
# ---- Esempio n. 11 (snippet separator from the example aggregator) ----
def get_time_query(year, month):
    """
    Build a TimeComponentQuery over data.start_local_dt for the given year
    (and optionally month). Returns None when neither is specified.
    """
    if year is None and month is None:
        return None

    # A month without a year is not meaningful
    assert year is not None
    components = {"year": year}
    if month is not None:
        components["month"] = month
    query_ld = ecwl.LocalDate(components)
    return esttc.TimeComponentQuery("data.start_local_dt", query_ld, query_ld)
    def testOct07MultiSyncSpuriousEndDetected(self):
        # Re-run, but with multiple calls to sync data
        # This tests the effect of online versus offline analysis and segmentation with potentially partial data

        dataFile = "emission/tests/data/real_examples/issue_436_assertion_error"
        # Plain decimal 7 instead of the confusing octal literal 0o7
        start_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
        end_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
        cacheKey = "diary/trips-2016-10-07"
        with open(dataFile + ".ground_truth") as gtf:
            ground_truth = json.load(gtf, object_hook=bju.object_hook)

        logging.info("Before loading, timeseries db size = %s" %
                     edb.get_timeseries_db().estimated_document_count())
        with open(dataFile) as df:
            all_entries = json.load(df, object_hook=bju.object_hook)
        # Split point for the simulated syncs: 18:33 local time on 2016-10-07
        ts_1800 = arrow.get("2016-10-07T18:33:11-07:00").timestamp
        logging.debug("ts_1800 = %s, converted back = %s" %
                      (ts_1800, arrow.get(ts_1800).to("America/Los_Angeles")))
        before_1800_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts <= ts_1800
        ]
        after_1800_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts > ts_1800
        ]

        # Sync at ~18:30 to capture all the points on the first part of the day
        etc.createAndFillUUID(self)
        self.entries = before_1800_entries
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Then sync the rest of the day
        self.entries = after_1800_entries
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)
        self.persistGroundTruthIfNeeded(api_result, dataFile, start_ld,
                                        cacheKey)

        # Although we process the day's data in two batches, we should get the same result
        self.compare_approx_result(ad.AttrDict({
            'result': api_result
        }).result,
                                   ad.AttrDict(ground_truth).data,
                                   time_fuzz=60,
                                   distance_fuzz=100)
# ---- Esempio n. 13 (snippet separator from the example aggregator) ----
    def testGroupedByOneLocalDayMultiUTCDay(self):
        """
        Three sections on one local day (2016-05-03 PDT) that span two UTC
        days: grouping by local date yields one bin, grouping by timestamp
        (UTC) yields two.
        """
        key = (2016, 5, 3)
        test_section_list = []

        # Sections at 06:00, 10:00 and 23:00 local time; the 23:00 one falls
        # on the next day in UTC
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,6, tzinfo=tz.gettz(PST)),
                                    PST))
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,10, tzinfo=tz.gettz(PST)),
                                    PST))
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)),
                                    PST))

        self._fillModeDistanceDuration(test_section_list)
        logging.debug("durations = %s" %
                      [s.data.duration for s in test_section_list])

        # There's only one local date, so it will be consistent with
        # results in testGroupedByOneLocalDayOneUTCDay
        summary_ld_dict = earmt.group_by_local_date(self.testUUID,
                                               ecwl.LocalDate({'year': 2016, 'month': 5}),
                                               ecwl.LocalDate({'year': 2016, 'month': 6}),
                                               earmt.LocalFreq.DAILY, [earmts.get_count])

        summary_ld = summary_ld_dict["result"][0]
        summary_ld_last = summary_ld_dict["last_ts_processed"]
        self.assertEqual(summary_ld_last,
                         arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)).timestamp)
        self.assertEqual(len(summary_ld), 1) # spans one day
        self.assertEqual(summary_ld[0].BICYCLING, 3)
        self.assertEqual(summary_ld[0].ts, 1462258800)
        self.assertEqual(summary_ld[0].local_dt.day, 3)

        summary_ts_dict = earmt.group_by_timestamp(self.testUUID,
                                           arrow.Arrow(2016,5,1).timestamp,
                                           arrow.Arrow(2016,6,1).timestamp,
                                           'd', [earmts.get_count])
        summary_ts = summary_ts_dict["result"][0]
        summary_ts_last = summary_ts_dict["last_ts_processed"]

        # But 23:00 PDT is 6am on the 4th in UTC,
        # so the results are different for this
        self.assertEqual(summary_ts_last,
                         arrow.Arrow(2016,5,3,23, tzinfo=tz.gettz(PST)).timestamp)
        self.assertEqual(len(summary_ts), 2) # spans two days in UTC
        self.assertEqual(summary_ts[0].BICYCLING, 2) # 2 trips on the first UTC day
        self.assertEqual(summary_ts[1].BICYCLING, 1) # 1 trip on the second UTC day
        self.assertEqual(summary_ts[0].local_dt.day, 3) # 06:00 and 10:00 PDT are still May 3 in UTC
        self.assertEqual(summary_ts[1].local_dt.day, 4) # 23:00 PDT on May 3 is 06:00 on May 4 in UTC
        self.assertEqual(summary_ts[0].ts, 1462233600) # timestamp for midnight 3rd May UTC
        self.assertEqual(summary_ts[1].ts, 1462320000) # timestamp for midnight 4th May UTC
# ---- Esempio n. 14 (snippet separator from the example aggregator) ----
    def testGroupedByOneLocalDayOneUTCDay(self):
        """
        Three sections on one local day that also fall on one UTC day:
        grouping by local date and grouping by timestamp should agree.
        """
        key = (2016, 5, 3)
        test_section_list = []
        #
        # The local time in May is PDT (UTC-7), so 06:00, 10:00 and 14:00
        # local map to 13:00, 17:00 and 21:00 UTC -- all on the same UTC day.
        # So we expect the local date and UTC bins to be the same.
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,6, tzinfo=tz.gettz(PST)),
                                    PST))
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,10, tzinfo=tz.gettz(PST)),
                                    PST))
        test_section_list.append(
            self._createTestSection(arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)),
                                    PST))

        self._fillModeDistanceDuration(test_section_list)
        logging.debug("durations = %s" %
                      [s.data.duration for s in test_section_list])

        summary_ts_dict = earmt.group_by_timestamp(self.testUUID,
                                           arrow.Arrow(2016,5,1).timestamp,
                                           arrow.Arrow(2016,6,1).timestamp,
                                           'd', [earmts.get_count])
        summary_ld_dict = earmt.group_by_local_date(self.testUUID,
                                               ecwl.LocalDate({'year': 2016, 'month': 5}),
                                               ecwl.LocalDate({'year': 2016, 'month': 6}),
                                               earmt.LocalFreq.DAILY, [earmts.get_count])

        summary_ts_last = summary_ts_dict["last_ts_processed"]
        summary_ld_last = summary_ld_dict["last_ts_processed"]

        summary_ts = summary_ts_dict["result"][0]
        summary_ld = summary_ld_dict["result"][0]

        # Both groupings should have processed up to the last (14:00) section
        self.assertEqual(summary_ts_last, arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)).timestamp)
        self.assertEqual(summary_ld_last, arrow.Arrow(2016,5,3,14, tzinfo=tz.gettz(PST)).timestamp)

        self.assertEqual(len(summary_ts), len(summary_ld)) # local date and UTC results are the same
        self.assertEqual(len(summary_ts), 1) # spans one day
        self.assertEqual(summary_ts[0].BICYCLING, summary_ld[0].BICYCLING)
        self.assertEqual(summary_ts[0].BICYCLING, 3)
        # Note that the timestamps are not guaranteed to be equal since
        # the UTC range starts at midnight UTC while the local time range
        # starts at midnight PDT
        # self.assertEqual(summary_ts[0].ts, summary_ld[0].ts)
        self.assertEqual(summary_ts[0].ts, 1462233600)
        self.assertEqual(summary_ld[0].ts, 1462258800)
        self.assertEqual(summary_ts[0].local_dt.day, 3)
        self.assertEqual(summary_ts[0].local_dt.day, summary_ld[0].local_dt.day)
    def testFeb22MultiSyncEndNotDetected(self):
        # Re-run, but with multiple calls to sync data
        # This tests the effect of online versus offline analysis and segmentation with potentially partial data

        dataFile = "emission/tests/data/real_examples/iphone_2016-02-22"
        start_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
        end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
        cacheKey = "diary/trips-2016-02-22"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile + ".ground_truth") as gtf:
            ground_truth = json.load(gtf, object_hook=bju.object_hook)

        logging.info("Before loading, timeseries db size = %s" %
                     edb.get_timeseries_db().count())
        with open(dataFile) as df:
            all_entries = json.load(df, object_hook=bju.object_hook)
        # 18:00:30 because the transition was at 2016-02-22T18:00:09.623404-08:00,
        # so split right after 18:00
        ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").timestamp
        logging.debug("ts_1800 = %s, converted back = %s" %
                      (ts_1800, arrow.get(ts_1800).to("America/Los_Angeles")))
        before_1800_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts <= ts_1800
        ]
        after_1800_entries = [
            e for e in all_entries
            if ad.AttrDict(e).metadata.write_ts > ts_1800
        ]

        # Sync at 18:00 to capture all the points on the trip *to* the optometrist
        # Skip the last few points to ensure that the trip end is skipped
        import uuid
        self.testUUID = uuid.uuid4()
        self.entries = before_1800_entries[0:-2]
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Then sync after 18:00
        self.entries = after_1800_entries
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Although we process the day's data in two batches, we should get the same result
        self.compare_approx_result(ad.AttrDict({
            'result': api_result
        }).result,
                                   ad.AttrDict(ground_truth).data,
                                   time_fuzz=60,
                                   distance_fuzz=100)
    def testFeb22ShortTripsDistance(self):
        """
        Single-run pipeline over the iphone_3 2016-02-22 dataset; the day's
        geojson should match the stored ground truth.
        """
        dataFile = "emission/tests/data/real_examples/iphone_3_2016-02-22"
        start_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
        end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
        cacheKey = "diary/trips-2016-02-22"
        # Use a context manager so the file handle is closed instead of leaked
        with open(dataFile + ".ground_truth") as gtf:
            ground_truth = json.load(gtf, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Single batch here (unlike the multi-sync variants); the result
        # should match the ground truth exactly
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                                   ad.AttrDict(ground_truth).data)
# ---- Esempio n. 17 (snippet separator from the example aggregator) ----
    def testResetToFuture(self):
        """
        - Load data for both days
        - Run pipelines
        - Reset to a date after the two
        - Verify that all is well
        - Re-run pipelines and ensure that there are no errors
        """
        # Load all data
        dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
        dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
        start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
        start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
        cacheKey_1 = "diary/trips-2016-07-22"
        cacheKey_2 = "diary/trips-2016-07-25"
        # Use context managers so the file handles are closed promptly instead
        # of leaking until garbage collection
        with open(dataFile_1 + ".ground_truth") as gt_fp:
            ground_truth_1 = json.load(gt_fp, object_hook=bju.object_hook)
        with open(dataFile_2 + ".ground_truth") as gt_fp:
            ground_truth_2 = json.load(gt_fp, object_hook=bju.object_hook)

        # Run both pipelines
        etc.setupRealExample(self, dataFile_1)
        etc.runIntakePipeline(self.testUUID)
        with open(dataFile_2) as entry_fp:
            self.entries = json.load(entry_fp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)

        # Reset to a date well after the two days
        reset_ts = arrow.get("2017-07-24").timestamp
        epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

        # Data should be untouched because of early return
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_1,
                                            start_ld_1)
        self.compare_result(
            ad.AttrDict({
                'result': api_result
            }).result,
            ad.AttrDict(ground_truth_1).data)

        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2,
                                            start_ld_2)
        self.compare_result(
            ad.AttrDict({
                'result': api_result
            }).result,
            ad.AttrDict(ground_truth_2).data)

        # Re-running the pipeline again should not affect anything
        etc.runIntakePipeline(self.testUUID)
    def testAug27TooMuchExtrapolation(self):
        """
        Single-run pipeline over the 2015-aug-27 dataset; the day's geojson
        should match the stored ground truth (regression for a case where
        extrapolation went too far).
        """
        dataFile = "emission/tests/data/real_examples/shankari_2015-aug-27"
        start_ld = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
        end_ld = ecwl.LocalDate({'year': 2015, 'month': 8, 'day': 27})
        cacheKey = "diary/trips-2015-08-27"
        # Use a context manager so the file handle is closed instead of leaked
        with open(dataFile + ".ground_truth") as gtf:
            ground_truth = json.load(gtf, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

        # Single batch here; the result should match the ground truth exactly
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)
 def testLocalMatchingQuery(self):
     """
     Search for all entries that occur at minute = 8 of any hour.
     """
     start_local_dt = ecwl.LocalDate({'minute': 8})
     end_local_dt = ecwl.LocalDate({'minute': 8})
     final_query = {"user_id": self.testUUID}
     final_query.update(esdl.get_range_query("data.local_dt", start_local_dt, end_local_dt))
     # Cursor.count() was removed in pymongo 4; count via the collection
     # instead. assertEquals is a deprecated alias of assertEqual.
     self.assertEqual(20, edb.get_timeseries_db().count_documents(final_query))
     entries_docs = edb.get_timeseries_db().find(final_query).sort("metadata.write_ts")
     entries = [ecwe.Entry(doc) for doc in entries_docs]
     logging.debug("entries bookends are %s and %s" % (entries[0], entries[-1]))
     first_entry = entries[0]
     self.assertEqual(first_entry.data.local_dt.hour, 9)
     last_entry = entries[19]
     self.assertEqual(last_entry.data.local_dt.hour, 17)
# ---- Esempio n. 20 (snippet separator from the example aggregator) ----
def get_trips_for_day(user_uuid, day, force_refresh):
    """
    Return the geojson timeline for a single local day.

    :param user_uuid: the user whose timeline we want
    :param day: a date string such as "2015-10-01" or "2016-01-01"; parsed
        into a local year/month/day triple used to query the timeseries
    :param force_refresh: not referenced in the current implementation;
        kept for interface compatibility
    """
    # We deliberately recompute the view from the timeseries instead of
    # reading from (or writing through) the usercache. Giving the usercache
    # a getDocument method would turn it into internal "storage" rather than
    # a cache; cached timelines should instead live in a materialized view
    # whose source of truth is elsewhere. Note also that the current day's
    # timeline would need recomputation multiple times, so the standard
    # technique the other pipeline stages use for deciding when to recompute
    # does not apply here.
    parsed = dup.parse(day)
    query_date = ecwl.LocalDate({
        'year': parsed.year,
        'month': parsed.month,
        'day': parsed.day
    })
    return gfc.get_geojson_for_dt(user_uuid, query_date, query_date)
    def testAug11(self):
        # This is a more complex day. Tests:
        # PR #352 (should not split trip to Oakland)
        # PR #348 (trip from station to OAK DOT)
        # PR #357 (trip to Radio Shack is complete and not truncated)
        # PR #345 (no cleaned trips are skipped)
        dataFile = "emission/tests/data/real_examples/shankari_2016-08-11"
        ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 11})
        cacheKey = "diary/trips-2016-08-11"
        with open(dataFile + ".ground_truth") as gfp:
            ground_truth = json.load(gfp, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        # runIntakePipeline does not store timeline views to the usercache,
        # so we validate the geojson API result directly.
        api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)

        expected = ad.AttrDict(ground_truth).data
        actual = ad.AttrDict({'result': api_result}).result
        self.compare_result(actual, expected)
    def testJun20(self):
        # This is a fairly straightforward day. Tests mainly:
        # - ordering of trips
        # - handling repeated location entries with different write timestamps
        # We have two identical location points with ts = 1466436483.395 and write_ts = 1466436496.4, 1466436497.047
        dataFile = "emission/tests/data/real_examples/shankari_2016-06-20"
        ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
        cacheKey = "diary/trips-2016-06-20"
        with open(dataFile + ".ground_truth") as gfp:
            ground_truth = json.load(gfp, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        # runIntakePipeline does not store timeline views to the usercache,
        # so we validate the geojson API result directly.
        api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)

        expected = ad.AttrDict(ground_truth).data
        actual = ad.AttrDict({'result': api_result}).result
        self.compare_result(actual, expected)
    def testJun21(self):
        # This is a more complex day. Tests:
        # PR #357 (spurious trip at 14:00 should be segmented and skipped)
        # PR #358 (trip back from bella's house at 16:00)
        dataFile = "emission/tests/data/real_examples/shankari_2016-06-21"
        ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 21})
        cacheKey = "diary/trips-2016-06-21"
        with open(dataFile + ".ground_truth") as gfp:
            ground_truth = json.load(gfp, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        # runIntakePipeline does not store timeline views to the usercache,
        # so we validate the geojson API result directly.
        api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)

        expected = ad.AttrDict(ground_truth).data
        actual = ad.AttrDict({'result': api_result}).result
        self.compare_result(actual, expected)
    def testAug10(self):
        # This is a more complex day. Tests:
        # PR #302 (trip to optometrist)
        # PR #352 (split optometrist trip)

        dataFile = "emission/tests/data/real_examples/shankari_2016-08-10"
        ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
        cacheKey = "diary/trips-2016-08-10"
        # Fix: close the ground truth file deterministically instead of
        # leaking the handle via json.load(open(...)); the sibling tests
        # (testAug11/testJun20/testJun21) already use this pattern.
        with open(dataFile + ".ground_truth") as gfp:
            ground_truth = json.load(gfp, object_hook=bju.object_hook)

        etc.setupRealExample(self, dataFile)
        etc.runIntakePipeline(self.testUUID)
        # runIntakePipeline does not store timeline views to the usercache,
        # so we validate the geojson API result directly.
        api_result = gfc.get_geojson_for_dt(self.testUUID, ld, ld)

        self.compare_result(
            ad.AttrDict({
                'result': api_result
            }).result,
            ad.AttrDict(ground_truth).data)
# --- Example 25 (scraped snippet separator; vote count "0" removed) ---
def local_dt_fill_times_yearly(key, section_group_df, metric_summary):
    """
    Populate ts / local_dt / fmt_time on metric_summary for the year `key`,
    anchored at the start of that year in the group's timezone.
    """
    tz = _get_tz(section_group_df)
    local_date = ecwl.LocalDate({'year': key, 'timezone': tz})
    year_start = arrow.Arrow(local_date.year, 1, 1, tzinfo=tz).floor('year')
    # NOTE(review): attribute access assumes arrow < 1.0, where .timestamp
    # was a property (it became a method in 1.0) — confirm the pinned version
    metric_summary.ts = year_start.timestamp
    metric_summary.local_dt = local_date
    metric_summary.fmt_time = year_start.format("YYYY")
# --- Example 26 (scraped snippet separator; vote count "0" removed) ---
 def testJun20Postload(self):
     """
     Same as testJun20Preload, except that the user input arrives only after
     the pipeline has run once; matching then happens on the next run.
     """
     self.checkConfirmedTripsAndSections(
         "emission/tests/data/real_examples/shankari_2016-06-20",
         ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20}),
         preload=False)
# --- Example 27 (scraped snippet separator; vote count "0" removed) ---
    def testLocalDate(self):
        """LocalDate exposes only the keys it was constructed with;
        accessing a missing component raises AttributeError."""
        import emission.core.wrapper.localdate as ecwl

        test_local = TestWrapper({'a': 1, 'c': 3})
        test_local.write_local_dt = ecwl.LocalDate({'year': 2016, 'month': 4})
        self.assertEqual(test_local.write_local_dt.year, 2016)
        self.assertEqual(test_local.write_local_dt.month, 4)
        # Fix: assertRaisesRegexp is a deprecated alias removed in
        # Python 3.12; use assertRaisesRegex instead.
        with self.assertRaisesRegex(AttributeError, ".*has no attribute.*"):
            print("the value of day is %s" % test_local.write_local_dt.day)
    def testAug10(self):
        # More complex day; exercises:
        # PR #302 (trip to optometrist)
        # PR #352 (split optometrist trip)
        self.standardMatchDataGroundTruth(
            "emission/tests/data/real_examples/shankari_2016-08-10",
            ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10}),
            "diary/trips-2016-08-10")
    def testJun21(self):
        # More complex day; exercises:
        # PR #357 (spurious trip at 14:00 should be segmented and skipped)
        # PR #358 (trip back from bella's house at 16:00)
        self.standardMatchDataGroundTruth(
            "emission/tests/data/real_examples/shankari_2016-06-21",
            ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 21}),
            "diary/trips-2016-06-21")
 def testJun20(self):
     """
     Fairly straightforward day. Tests mainly:
     - ordering of trips
     - handling repeated location entries with different write timestamps
       (two identical points with ts = 1466436483.395 and
       write_ts = 1466436496.4, 1466436497.047)
     """
     self.standardMatchDataGroundTruth(
         "emission/tests/data/real_examples/shankari_2016-06-20",
         ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20}),
         "diary/trips-2016-06-20")