def move_all_filters_to_data():
    tsdb = edb.get_timeseries_db()
    for entry in tsdb.find():
        if "filter" in entry["metadata"]:
            curr_filter = entry["metadata"]["filter"]
            if is_location_entry(entry):
                entry["data"]["filter"] = curr_filter
                logging.debug("for entry %s, found key %s, moved filter %s into data" %
                              (entry["_id"], get_curr_key(entry), curr_filter))
            # For all cases, including the location one, we want to delete the
            # filter from metadata
            del entry["metadata"]["filter"]
            edb.save(tsdb, entry)
            logging.debug("for entry %s, for key %s, deleted filter %s from metadata" %
                          (entry["_id"], get_curr_key(entry), curr_filter))
        else:
            pass
            # logging.warning("No filter found for entry %s, skipping" % entry)
        if "filter" not in entry["data"] and is_location_entry(entry):
            # This must be an entry from before the time that we started sending
            # entries to the server. At that time, we only sent time entries,
            # so set it to time in this case
            entry["data"]["filter"] = "time"
            logging.debug("No entry found in either data or metadata, for key %s setting to 'time'" %
                          entry["metadata"]["key"])
            edb.save(tsdb, entry)
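# Illustrative sketch, not part of the original migration: the shape of a
# location entry before and after move_all_filters_to_data() runs. All field
# values here are made up for the example.
example_before = {"_id": 1,
                  "metadata": {"key": "background/location", "filter": "distance"},
                  "data": {"latitude": 37.88, "longitude": -122.27}}
# After the migration, the filter has moved from metadata into data:
example_after = {"_id": 1,
                 "metadata": {"key": "background/location"},
                 "data": {"latitude": 37.88, "longitude": -122.27, "filter": "distance"}}
# A location entry with no filter anywhere predates server-side filters and is
# backfilled with "time", per the comment in the function above.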
def mark_stage_done(user_id, stage, last_processed_ts):
    # We move failed entries to the error timeseries. So usercache runs never fail.
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    curr_state.last_ts_run = curr_state.curr_run_ts
    # It is incorrect to assume that we have processed all the data until the
    # start of the last run. In particular, due to network connectivity or
    # other issues, it is possible that there is outstanding data on phones
    # that was collected before the last run started. And if we set this, then
    # that data will simply be skipped. The same logic applies to all
    # decorators that are based on client collected data (trip start ts, etc) -
    # it is only accurate for server generated data. So for maximum generality,
    # let's allow the stage to pass in last_processed_ts.
    if last_processed_ts is not None:
        logging.info("For stage %s, last_ts_processed = %s" %
                     (stage, pydt.datetime.utcfromtimestamp(last_processed_ts).isoformat()))
        curr_state.last_processed_ts = last_processed_ts
    else:
        logging.info("For stage %s, last_ts_processed is unchanged" % stage)
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" %
                  (curr_state, list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
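# Illustrative sketch, not part of the original module: one way a stage could
# compute the last_processed_ts it passes to mark_stage_done(), following the
# comment above. `processed_entries` is a hypothetical list of the entries the
# run actually handled, using the metadata.write_ts convention seen elsewhere
# in this file.
def example_last_processed_ts(processed_entries):
    if len(processed_entries) == 0:
        # nothing was processed; return None so last_processed_ts is unchanged
        return None
    # use the newest write_ts actually consumed, not the run start time, so
    # late-arriving phone data is not skipped on the next run
    return max(e["metadata"]["write_ts"] for e in processed_entries)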
def setUp(self):
    self.testUsers = ["*****@*****.**", "*****@*****.**", "*****@*****.**",
                      "*****@*****.**", "*****@*****.**"]
    self.serverName = edb.url
    self.ModesColl = get_mode_db()
    self.SectionsColl = get_section_db()

    # Let's make sure that the users are registered so that they have profiles
    user_objects = []
    for userEmail in self.testUsers:
        user_objects.append(User.register(userEmail))

    # Sometimes, we may have entries left behind in the database if one of the tests failed
    # or threw an exception, so let us start by cleaning up all entries
    for testUser in user_objects:
        etc.purgeSectionData(self.SectionsColl, testUser.uuid)

    if self.ModesColl.estimated_document_count() > 0:
        self.ModesColl.delete_many({})
    self.assertEqual(self.ModesColl.estimated_document_count(), 0)
    self.assertEqual(self.SectionsColl.estimated_document_count(), 0)

    MongoClient(edb.url).drop_database("Backup_database")

    etc.loadTable(self.serverName, "Stage_Modes", "emission/tests/data/modes.json")
    etc.loadTable(self.serverName, "Stage_Sections", "emission/tests/data/testModeInferSeedFile")

    self.now = datetime.now()
    self.dayago = self.now - timedelta(days=1)
    self.weekago = self.now - timedelta(weeks=1)

    for section in self.SectionsColl.find():
        section['section_start_datetime'] = self.dayago
        section['section_end_datetime'] = self.dayago + timedelta(hours=1)
        if (section['confirmed_mode'] == 5):
            # We only cluster bus and train trips
            # And our test data only has bus trips
            section['section_start_point'] = {u'type': u'Point',
                                              u'coordinates': [-122.270039042, 37.8800285728]}
            section['section_end_point'] = {u'type': u'Point',
                                            u'coordinates': [-122.2690412952, 37.8739578595]}
        # print("Section start = %s, section end = %s" %
        #       (section['section_start_datetime'], section['section_end_datetime']))
        # Replace the user email with the UUID
        section['user_id'] = User.fromEmail(section['user_id']).uuid
        edb.save(self.SectionsColl, section)

    self.pipeline = pipeline.ModeInferencePipelineMovesFormat()
    self.testLoadTrainingData()
def loadPointsForTrip(self, trip_id):
    import emission.core.get_database as edb
    with open("emission/tests/data/smoothing_data/%s" % trip_id) as pfp:
        entries = json.load(pfp, object_hook=bju.object_hook)
    tsdb = edb.get_timeseries_db()
    for entry in entries:
        entry["user_id"] = self.testUUID
        edb.save(tsdb, entry)
def update(entry):
    """
    Save the specified entry. In general, our entries are read-only, so
    this should only be called under very rare conditions. Once we identify
    what these conditions are, we should consider replacing them with
    versioned objects
    """
    logging.debug("update called")
    ts = esta.TimeSeries.get_time_series(entry.user_id)
    logging.debug("Saving entry %s into timeseries" % entry)
    edb.save(ts.get_timeseries_db(entry.metadata.key), entry)
def mark_stage_failed(user_id, stage):
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    # last_ts_run remains unchanged since this run did not succeed
    # the next query will start from the start_ts of this run
    # we also reset the curr_run_ts to indicate that we are not currently running
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" %
                  (curr_state, list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
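# Illustrative sketch, not part of the original module: the intended lifecycle
# around these state functions. `run_stage` is a hypothetical callable that
# processes the entries matched by the time query and returns a timestamp.
def example_run_one_stage(user_id, stage, run_stage):
    time_query = get_time_range_for_stage(user_id, stage)  # sets curr_run_ts
    try:
        last_processed_ts = run_stage(user_id, time_query)
        # success: record progress and clear curr_run_ts
        mark_stage_done(user_id, stage, last_processed_ts)
    except Exception:
        logging.exception("Stage %s failed for user %s" % (stage, user_id))
        # failure: only clear curr_run_ts; last_processed_ts is untouched, so
        # the next run retries from the same start_ts
        mark_stage_failed(user_id, stage)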
def setupRealExampleWithEntries(testObj):
    tsdb = edb.get_timeseries_db()
    for entry in testObj.entries:
        entry["user_id"] = testObj.testUUID
        # print "Saving entry with write_ts = %s and ts = %s" % (entry["metadata"]["write_fmt_time"],
        #                                                        entry["data"]["fmt_time"])
        edb.save(tsdb, entry)

    logging.info("After loading, timeseries db size = %s" % edb.get_timeseries_db().count())
    logging.debug("First few entries = %s" %
                  [e["data"]["fmt_time"] if "fmt_time" in e["data"] else e["metadata"]["write_fmt_time"]
                   for e in list(edb.get_timeseries_db().find({"user_id": testObj.testUUID})
                                 .sort("data.write_ts", pymongo.ASCENDING).limit(10))])
def update_data(user_id, key, obj_id, data):
    """
    Save the specified entry. In general, our entries are read-only, so
    this should only be called under very rare conditions. Once we identify
    what these conditions are, we should consider replacing them with
    versioned objects
    """
    logging.debug("update_data called")
    ts = esta.TimeSeries.get_time_series(user_id)
    new_entry = ecwe.Entry.create_entry(user_id, key, data)
    # Make sure that we update the existing entry instead of creating a new one
    new_entry['_id'] = obj_id
    logging.debug("updating entry %s into timeseries" % new_entry)
    edb.save(ts.get_timeseries_db(key), new_entry)
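# Illustrative sketch, not part of the original module: correcting one field
# of an existing entry in place. The field name and value are hypothetical;
# passing the original _id is what makes this an update rather than an insert.
def example_fix_entry(user_id, entry):
    fixed_data = entry["data"]
    fixed_data["manual_mode"] = "bicycling"  # hypothetical correction
    update_data(user_id, entry["metadata"]["key"], entry["_id"], fixed_data)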
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage
    """
    curr_state = get_current_state(user_id, stage)
    if curr_state is None:
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None
    else:
        start_ts = curr_state.last_processed_ts

    if start_ts is None:
        logging.info("For stage %s, start_ts is None" % stage)
    else:
        logging.info("For stage %s, start_ts = %s" %
                     (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))

    assert curr_state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts
    # Let's pick a point 5 secs in the past. If we don't do this, then we will
    # read all entries up to the current ts and this may lead to lost data. For
    # example, let us say that the current ts is t1. At the time that we read
    # the data, we have 4 entries for t1. By the time we finish copying, we
    # have 6 entries for t1, we will end up deleting all 6, which will lose 2
    # entries.
    end_ts = time.time() - END_FUZZ_AVOID_LTE

    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    curr_state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" %
                  (curr_state, list(edb.get_pipeline_state_db().find({"user_id": user_id}))))

    return ret_query
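# Illustrative sketch, not part of the original module: consuming the returned
# TimeQuery. The esda.get_entries(key, user_id, time_query) call mirrors its
# usage in the segmentation test below; RAW_TRIP_KEY is just an example key,
# and the esda import is assumed to be available.
def example_read_new_data(user_id, stage):
    time_query = get_time_range_for_stage(user_id, stage)
    # returns only entries whose metadata.write_ts falls within the range, so
    # the 5-second fuzz above makes concurrent writes at end_ts unlikely
    return esda.get_entries(esda.RAW_TRIP_KEY, user_id, time_query)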
def save_common_place(common_place):
    edb.save(edb.get_common_place_db(), common_place)
def save_user_entry(user_id, user_entry):
    assert(user_entry["user_id"] == user_id)
    return edb.save(edb.get_habitica_db(), user_entry)
def testSegmentationWrapperCombined(self):
    # Change iOS entries to have the android UUID
    tsdb = edb.get_timeseries_db()
    for entry in esta.TimeSeries.get_time_series(self.iosUUID).find_entries():
        entry["user_id"] = self.androidUUID
        edb.save(tsdb, entry)

    # Now, segment the data for the combined UUID, which will include both
    # android and ios
    eaist.segment_current_trips(self.androidUUID)

    tq_place = estt.TimeQuery("data.enter_ts", 1440658800, 1446847600)
    created_places_entries = esda.get_entries(esda.RAW_PLACE_KEY,
                                              self.androidUUID, tq_place)

    tq_trip = estt.TimeQuery("data.start_ts", 1440658800, 1446847600)
    created_trips_entries = esda.get_entries(esda.RAW_TRIP_KEY,
                                             self.androidUUID, tq_trip,
                                             untracked_key=esda.RAW_UNTRACKED_KEY)

    for i, place in enumerate(created_places_entries):
        logging.debug("Retrieved places %s: %s -> %s" %
                      (i, place.data.enter_fmt_time, place.data.exit_fmt_time))
    for i, trip in enumerate(created_trips_entries):
        logging.debug("Retrieved trips %s: %s -> %s" %
                      (i, trip.data.start_fmt_time, trip.data.end_fmt_time))

    # We expect there to be 12 places, but the first one is the start of
    # the chain, so it has a start_time of None and it won't be retrieved
    # by the query on the start_time that we show here.
    self.assertEqual(len(created_places_entries), 11)
    self.assertEqual(len(created_trips_entries), 11)

    # Pick the first two trips and the first place and ensure that they are all linked correctly
    # Note that this is the first place, not the second place because the true first place will not
    # be retrieved by the query, as shown above
    # The first trip here is a dummy trip, so let's check the second and third trip instead
    trip0time = created_trips_entries[0]
    trip1time = created_trips_entries[1]
    place0time = created_places_entries[0]

    self.assertEqual(trip0time.data.end_place, place0time.get_id())
    self.assertEqual(trip1time.data.start_place, place0time.get_id())
    self.assertEqual(place0time.data.ending_trip, trip0time.get_id())
    self.assertEqual(place0time.data.starting_trip, trip1time.get_id())

    self.assertEqual(round(trip0time.data.duration), 11 * 60 + 9)
    self.assertEqual(round(trip1time.data.duration), 6 * 60 + 54)

    self.assertIsNotNone(place0time.data.location)

    # There are 9 android "trips" first (index: 0-8), including the untracked time
    # index 9 is the short, bogus trip
    # So we want to check trips 10 and 11
    trip0dist = created_trips_entries[9]
    trip1dist = created_trips_entries[10]
    place0dist = created_places_entries[9]

    self.assertEqual(trip0dist.data.end_place, place0dist.get_id())
    self.assertEqual(trip1dist.data.start_place, place0dist.get_id())
    self.assertEqual(place0dist.data.ending_trip, trip0dist.get_id())
    self.assertEqual(place0dist.data.starting_trip, trip1dist.get_id())

    self.assertEqual(round(trip0dist.data.duration), 14 * 60 + 41)
    self.assertEqual(round(trip1dist.data.duration), 1 * 60 * 60 + 50 * 60 + 56)

    self.assertIsNotNone(place0dist.data.location)