def move_all_filters_to_data():
    tsdb = edb.get_timeseries_db()
    for entry in tsdb.find():
        if "filter" in entry["metadata"]:
            curr_filter = entry["metadata"]["filter"]
            if is_location_entry(entry):
                entry["data"]["filter"] = curr_filter
                logging.debug(
                    "for entry %s, found key %s, moved filter %s into data" %
                    (entry["_id"], get_curr_key(entry), curr_filter))

            # For all cases, including the location one, we want to delete the filter from metadata
            del entry["metadata"]["filter"]
            edb.save(tsdb, entry)
            logging.debug(
                "for entry %s, for key %s, deleted filter %s from metadata" %
                (entry["_id"], get_curr_key(entry), curr_filter))
        else:
            pass
            # logging.warning("No filter found for entry %s, skipping" % entry)

        if "filter" not in entry["data"] and is_location_entry(entry):
            # This must be an entry from before the time that we started sending
            # entries to the server. At that time, we only sent time entries,
            # so set it to time in this case
            entry["data"]["filter"] = "time"
            logging.debug(
                "No filter found in either data or metadata, for key %s setting to 'time'"
                % entry["metadata"]["key"])
            edb.save(tsdb, entry)
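
For reference, a minimal sketch of what this migration does to a single entry; the key name and field values are hypothetical, not taken from the source:

# Hypothetical location entry before the migration: the filter lives in metadata
entry_before = {
    "_id": "...",
    "metadata": {"key": "background/location", "filter": "distance"},
    "data": {"ts": 1440658800, "latitude": 37.88, "longitude": -122.27},
}

# After move_all_filters_to_data(), the filter has moved into data and has been
# removed from metadata; location entries that never had a filter anywhere are
# defaulted to "time"
entry_after = {
    "_id": "...",
    "metadata": {"key": "background/location"},
    "data": {"ts": 1440658800, "latitude": 37.88, "longitude": -122.27,
             "filter": "distance"},
}
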
def mark_stage_done(user_id, stage, last_processed_ts):
    # We move failed entries to the error timeseries. So usercache runs never fail.
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    curr_state.last_ts_run = curr_state.curr_run_ts
    # It is incorrect to assume that we have processed all the data until the
    # start of the last run. In particular, due to network connectivity or
    # other issues, it is possible that there is outstanding data on phones
    # that was collected before the last run started. And if we set this, then
    # that data will simply be skipped. The same logic applies to all
    # decorators that are based on client collected data (trip start ts, etc) -
    # it is only accurate for server generated data. So for maximum generality,
    # let's allow the stage to pass in last_processed_ts.
    if last_processed_ts is not None:
        logging.info("For stage %s, last_ts_processed = %s" %
                     (stage, pydt.datetime.utcfromtimestamp(last_processed_ts).isoformat()))
        curr_state.last_processed_ts = last_processed_ts
    else:
        logging.info("For stage %s, last_ts_processed is unchanged" % stage)
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" % (curr_state,
        list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
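
The comments above describe the intended calling convention. A minimal sketch of how a stage might wrap its work with these helpers, assuming a placeholder stage constant and processing function (neither is from the source):

def run_hypothetical_stage(user_id):
    # Reserves the time range for this run and sets curr_run_ts
    # (get_time_range_for_stage is shown later in this section)
    time_query = get_time_range_for_stage(user_id, HYPOTHETICAL_STAGE)
    try:
        # process_entries is a placeholder for the stage's real work; it returns
        # the ts of the last entry it actually handled, or None
        last_processed_ts = process_entries(user_id, time_query)
        mark_stage_done(user_id, HYPOTHETICAL_STAGE, last_processed_ts)
    except Exception:
        logging.exception("Stage %s failed for user %s" %
                          (HYPOTHETICAL_STAGE, user_id))
        # Clears curr_run_ts without advancing last_processed_ts, so the next
        # run retries the same time range
        mark_stage_failed(user_id, HYPOTHETICAL_STAGE)
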
Example #5
    def setUp(self):
        self.testUsers = [
            "*****@*****.**", "*****@*****.**", "*****@*****.**",
            "*****@*****.**", "*****@*****.**"
        ]
        self.serverName = edb.url

        self.ModesColl = get_mode_db()
        self.SectionsColl = get_section_db()

        # Let's make sure that the users are registered so that they have profiles
        user_objects = []
        for userEmail in self.testUsers:
            user_objects.append(User.register(userEmail))

        # Sometimes, we may have entries left behind in the database if one of the tests failed
        # or threw an exception, so let us start by cleaning up all entries
        for testUser in user_objects:
            etc.purgeSectionData(self.SectionsColl, testUser.uuid)

        if self.ModesColl.estimated_document_count() > 0:
            self.ModesColl.delete_many({})

        self.assertEqual(self.ModesColl.estimated_document_count(), 0)

        self.assertEqual(self.SectionsColl.estimated_document_count(), 0)

        MongoClient(edb.url).drop_database("Backup_database")

        etc.loadTable(self.serverName, "Stage_Modes",
                      "emission/tests/data/modes.json")
        etc.loadTable(self.serverName, "Stage_Sections",
                      "emission/tests/data/testModeInferSeedFile")

        self.now = datetime.now()
        self.dayago = self.now - timedelta(days=1)
        self.weekago = self.now - timedelta(weeks=1)

        for section in self.SectionsColl.find():
            section['section_start_datetime'] = self.dayago
            section['section_end_datetime'] = self.dayago + timedelta(hours=1)
            if (section['confirmed_mode'] == 5):
                # We only cluster bus and train trips
                # And our test data only has bus trips
                section['section_start_point'] = {
                    u'type': u'Point',
                    u'coordinates': [-122.270039042, 37.8800285728]
                }
                section['section_end_point'] = {
                    u'type': u'Point',
                    u'coordinates': [-122.2690412952, 37.8739578595]
                }
            # print("Section start = %s, section end = %s" %
            #   (section['section_start_datetime'], section['section_end_datetime']))
            # Replace the user email with the UUID
            section['user_id'] = User.fromEmail(section['user_id']).uuid
            edb.save(self.SectionsColl, section)

        self.pipeline = pipeline.ModeInferencePipelineMovesFormat()
        self.testLoadTrainingData()
    def loadPointsForTrip(self, trip_id):
        import emission.core.get_database as edb

        with open("emission/tests/data/smoothing_data/%s" % trip_id) as pfp:
            entries = json.load(pfp, object_hook=bju.object_hook)
        tsdb = edb.get_timeseries_db()
        for entry in entries:
            entry["user_id"] = self.testUUID
            edb.save(tsdb, entry)
Example #8
def update(entry):
    """
    Save the specified entry. In general, our entries are read-only, so
    this should only be called under very rare conditions. Once we identify
    what these conditions are, we should consider replacing them with
    versioned objects
    """
    logging.debug("update called")
    ts = esta.TimeSeries.get_time_series(entry.user_id)
    logging.debug("Saving entry %s into timeseries" % entry)
    edb.save(ts.get_timeseries_db(entry.metadata.key), entry)
def mark_stage_failed(user_id, stage):
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    # last_ts_run remains unchanged since this run did not succeed
    # the next query will start from the start_ts of this run
    # we also reset the curr_run_ts to indicate that we are not currently running
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" % (curr_state,
        list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
Example #11
def setupRealExampleWithEntries(testObj):
    tsdb = edb.get_timeseries_db()
    for entry in testObj.entries:
        entry["user_id"] = testObj.testUUID
        # print "Saving entry with write_ts = %s and ts = %s" % (entry["metadata"]["write_fmt_time"],
        #                                                        entry["data"]["fmt_time"])
        edb.save(tsdb, entry)
        
    logging.info("After loading, timeseries db size = %s" % edb.get_timeseries_db().count())
    logging.debug("First few entries = %s" % 
                    [e["data"]["fmt_time"] if "fmt_time" in e["data"] else e["metadata"]["write_fmt_time"] for e in 
                        list(edb.get_timeseries_db().find({"user_id": testObj.testUUID}).sort("data.write_ts",
                                                                                       pymongo.ASCENDING).limit(10))])
Example #12
def update_data(user_id, key, obj_id, data):
    """
    Save the specified entry. In general, our entries are read-only, so
    this should only be called under very rare conditions. Once we identify
    what these conditions are, we should consider replacing them with
    versioned objects
    """
    logging.debug("update_data called")
    ts = esta.TimeSeries.get_time_series(user_id)
    new_entry = ecwe.Entry.create_entry(user_id, key, data)
    # Make sure that we update the existing entry instead of creating a new one
    new_entry['_id'] = obj_id
    logging.debug("updating entry %s into timeseries" % new_entry)
    edb.save(ts.get_timeseries_db(key), new_entry)
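
A minimal sketch of calling update_data() to correct a single field on an entry that is already stored; the key, field name, and query are illustrative, not from the source:

# Fetch an existing entry directly from the timeseries (illustrative key)
existing = edb.get_timeseries_db().find_one(
    {"user_id": user_id, "metadata.key": "segmentation/raw_trip"})

# Correct one field and write it back under the same _id, so the stored
# document is overwritten rather than duplicated
fixed_data = existing["data"]
fixed_data["end_ts"] = fixed_data["end_ts"] + 30   # hypothetical correction
update_data(user_id, "segmentation/raw_trip", existing["_id"], fixed_data)
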
Example #15
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage
    """
    curr_state = get_current_state(user_id, stage)

    if curr_state is None:
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None
    else:
        start_ts = curr_state.last_processed_ts

    if start_ts is None:
        logging.info("For stage %s, start_ts is None" % stage)
    else:
        logging.info(
            "For stage %s, start_ts = %s" %
            (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))

    assert curr_state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts
    # Let's pick a point 5 secs in the past. If we don't do this, then we will
    # read all entries up to the current ts, and this may lead to lost data. For
    # example, let us say that the current ts is t1. At the time that we read
    # the data, we have 4 entries for t1. By the time we finish copying, we
    # have 6 entries for t1, and we will end up deleting all 6, which will lose
    # 2 entries.
    end_ts = time.time() - END_FUZZ_AVOID_LTE

    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    curr_state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug(
        "After saving state %s, list is %s" %
        (curr_state,
         list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
    return ret_query
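
A minimal sketch of consuming the returned query; esda.get_entries with this (key, user_id, time_query) signature appears in the segmentation test later in this section, while the specific key and stage constant are illustrative:

time_query = get_time_range_for_stage(user_id, HYPOTHETICAL_STAGE)
place_entries = esda.get_entries(esda.RAW_PLACE_KEY, user_id, time_query)
logging.debug("Retrieved %d entries to process" % len(place_entries))
# ... process the entries, then call mark_stage_done() or mark_stage_failed()
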
Example #17
def save_common_place(common_place):
    edb.save(edb.get_common_place_db(), common_place)
Example #18
def save_user_entry(user_id, user_entry):
    assert (user_entry["user_id"] == user_id)
    return edb.save(edb.get_habitica_db(), user_entry)
Example #19
    def testSegmentationWrapperCombined(self):
        # Change iOS entries to have the android UUID
        tsdb = edb.get_timeseries_db()
        for entry in esta.TimeSeries.get_time_series(
                self.iosUUID).find_entries():
            entry["user_id"] = self.androidUUID
            edb.save(tsdb, entry)
        
        # Now, segment the data for the combined UUID, which will include both
        # android and ios
        eaist.segment_current_trips(self.androidUUID)

        tq_place = estt.TimeQuery("data.enter_ts", 1440658800, 1446847600)
        created_places_entries = esda.get_entries(esda.RAW_PLACE_KEY,
                                                  self.androidUUID, tq_place)

        tq_trip = estt.TimeQuery("data.start_ts", 1440658800, 1446847600)
        created_trips_entries = esda.get_entries(esda.RAW_TRIP_KEY,
                                                 self.androidUUID, tq_trip,
                                                 untracked_key=esda.RAW_UNTRACKED_KEY)

        for i, place in enumerate(created_places_entries):
            logging.debug("Retrieved places %s: %s -> %s" % (i, place.data.enter_fmt_time, place.data.exit_fmt_time))
        for i, trip in enumerate(created_trips_entries):
            logging.debug("Retrieved trips %s: %s -> %s" % (i, trip.data.start_fmt_time, trip.data.end_fmt_time))

        # We expect there to be 12 places, but the first one is the start of
        # the chain, so it has a start_time of None and it won't be retrieved
        # by the query on the start_time that we show here.
        self.assertEqual(len(created_places_entries), 11)
        self.assertEqual(len(created_trips_entries), 11)

        # Pick the first two trips and the first place and ensure that they are all linked correctly
        # Note that this is the first place, not the second place because the true first place will not
        # be retrieved by the query, as shown above
        # The first trip here is a dummy trip, so let's check the second and third trip instead
        trip0time = created_trips_entries[0]
        trip1time = created_trips_entries[1]
        place0time = created_places_entries[0]
        
        self.assertEqual(trip0time.data.end_place, place0time.get_id())
        self.assertEqual(trip1time.data.start_place, place0time.get_id())
        self.assertEqual(place0time.data.ending_trip, trip0time.get_id())
        self.assertEqual(place0time.data.starting_trip, trip1time.get_id())

        self.assertEqual(round(trip0time.data.duration), 11 * 60 + 9)
        self.assertEqual(round(trip1time.data.duration), 6 * 60 + 54)

        self.assertIsNotNone(place0time.data.location)
        
        # There are 9 android "trips" first (index: 0-8), including the untracked time
        # index 9 is the short, bogus trip
        # So we want to check trips 10 and 11
        trip0dist = created_trips_entries[9]
        trip1dist = created_trips_entries[10]
        place0dist = created_places_entries[9]
        
        self.assertEqual(trip0dist.data.end_place, place0dist.get_id())
        self.assertEqual(trip1dist.data.start_place, place0dist.get_id())
        self.assertEqual(place0dist.data.ending_trip, trip0dist.get_id())
        self.assertEqual(place0dist.data.starting_trip, trip1dist.get_id())

        self.assertEqual(round(trip0dist.data.duration), 14 * 60 + 41)
        self.assertEqual(round(trip1dist.data.duration), 1 * 60 * 60 + 50 * 60 + 56)

        self.assertIsNotNone(place0dist.data.location)