def tearDown(self):
    edb.get_analysis_timeseries_db().remove({'user_id': self.testUUID})
    del_result = proxy.habiticaProxy(self.testUUID, "DELETE",
                                     "/api/v3/user", {'password': "******"})
    edb.get_habitica_db().remove({'user_id': self.testUUID})
    logging.debug("in tearDown, result = %s" % del_result)
def del_objects_after(user_id, reset_ts, is_dry_run):
    del_query = {}
    # handle the user
    del_query.update({"user_id": user_id})
    del_query.update({"metadata.key": {"$in": ["inference/prediction",
                                               "analysis/inferred_section"]}})
    # all objects inserted here have start_ts and end_ts and are trip-like
    del_query.update({"data.start_ts": {"$gt": reset_ts}})
    logging.debug("After all updates, del_query = %s" % del_query)

    reset_pipeline_query = {"user_id": user_id,
                            "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
    # Fuzz the TRIP_SEGMENTATION stage 5 mins because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    FUZZ_FACTOR = 5 * 60
    reset_pipeline_update = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
    logging.info("About to reset stage %s to %s" %
                 (ecwp.PipelineStages.MODE_INFERENCE, reset_ts))

    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

    if is_dry_run:
        logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)
        result = edb.get_pipeline_state_db().update_one(reset_pipeline_query,
                                                        reset_pipeline_update)
        logging.info("this is not a dry-run, result of updating pipeline state is %s" % result.raw_result)
def tearDown(self):
    edb.get_timeseries_db().remove({"user_id": self.testUUID})
    edb.get_analysis_timeseries_db().remove({"user_id": self.testUUID})
    edb.get_timeseries_db().remove({"user_id": "new_fake"})
    edb.get_analysis_timeseries_db().remove({"user_id": "new_fake"})
    edb.get_common_trip_db().drop()
    edb.get_common_place_db().drop()
def del_objects_after(user_id, reset_ts, is_dry_run):
    del_query = {}
    # handle the user
    del_query.update({"user_id": user_id})
    del_query.update({"metadata.key": {"$in": ["inference/prediction",
                                               "analysis/inferred_section"]}})
    # all objects inserted here have start_ts and end_ts and are trip-like
    del_query.update({"data.start_ts": {"$gt": reset_ts}})
    logging.debug("After all updates, del_query = %s" % del_query)

    reset_pipeline_query = {"pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
    # Fuzz the TRIP_SEGMENTATION stage 5 mins because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    FUZZ_FACTOR = 5 * 60
    reset_pipeline_update = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
    logging.info("About to reset stage %s to %s" %
                 (ecwp.PipelineStages.MODE_INFERENCE, reset_ts))

    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

    if is_dry_run:
        logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)
def clearRelatedDb(self): edb.get_timeseries_db().remove({"user_id": self.androidUUID}) edb.get_analysis_timeseries_db().remove({"user_id": self.androidUUID}) edb.get_pipeline_state_db().remove({"user_id": self.androidUUID}) edb.get_timeseries_db().remove({"user_id": self.iosUUID}) edb.get_analysis_timeseries_db().remove({"user_id": self.iosUUID}) edb.get_pipeline_state_db().remove({"user_id": self.iosUUID})
def tearDown(self): edb.get_timeseries_db().remove({"user_id": self.androidUUID}) edb.get_timeseries_db().remove({"user_id": self.iosUUID}) edb.get_usercache_db().remove({"user_id": self.androidUUID}) edb.get_usercache_db().remove({"user_id": self.iosUUID}) edb.get_analysis_timeseries_db().remove({"user_id": self.androidUUID}) edb.get_analysis_timeseries_db().remove({"user_id": self.iosUUID})
def del_all_objects(is_dry_run):
    del_query = {}
    del_query.update({"metadata.key": {"$in": ["inference/prediction",
                                               "analysis/inferred_section"]}})
    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

    del_pipeline_query = {"pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
    logging.info("About to delete pipeline entries for stage %s" %
                 ecwp.PipelineStages.MODE_INFERENCE)

    if is_dry_run:
        logging.info("this is a dry-run, returning from del_all_objects without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().delete_many(del_query)
        logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result.raw_result)
        result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
        logging.info("this is not a dry-run, result of deleting pipeline state is %s" % result.raw_result)
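# A minimal driver sketch for del_all_objects above (not from the original
# source; the --dry-run flag name is an assumption for illustration). The
# intended workflow is to run once with is_dry_run=True to review the logged
# counts and keys, then rerun without the flag to actually delete.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="log what would be deleted without deleting")
    args = parser.parse_args()
    del_all_objects(args.dry_run)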
def clearRelatedDb(self): edb.get_timeseries_db().remove({"user_id": {"$in": self.testUUIDList}}) edb.get_analysis_timeseries_db().remove( {"user_id": { "$in": self.testUUIDList }}) edb.get_usercache_db().remove({"user_id": {"$in": self.testUUIDList}})
def testReadWriteUser(self):
    try:
        rw_username = "******"
        rw_password = "******"
        self.admin_auth.command({
            "createUser": rw_username,
            "pwd": rw_password,
            "roles": [{"role": "readWrite", "db": "Stage_database"}]
        })
        result = self.admin_auth.command({"usersInfo": rw_username})
        self.assertEqual(result['ok'], 1.0)
        self.assertEqual(len(result['users']), 1)
        self.assertEqual(result['users'][0]['user'], rw_username)

        self.configureDB(self.getURL(rw_username, rw_password))

        import emission.tests.storageTests.analysis_ts_common as etsa
        import emission.storage.decorations.analysis_timeseries_queries as esda
        import emission.core.wrapper.rawplace as ecwrp
        import emission.storage.timeseries.abstract_timeseries as esta

        ts = esta.TimeSeries.get_time_series(self.uuid)
        etsa.createNewPlaceLike(self, esda.RAW_PLACE_KEY, ecwrp.Rawplace)
        inserted_df = ts.get_data_df(esda.RAW_PLACE_KEY)
        self.assertEqual(len(inserted_df), 1)
        self.assertEqual(len(ts.get_data_df(esda.CLEANED_PLACE_KEY)), 0)
    finally:
        import emission.core.get_database as edb
        edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUserId})
def clearRelatedDb(self):
    edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
    edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
    edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
    edb.get_timeseries_db().delete_many({"user_id": self.testUUID1})
    edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID1})
    edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID1})
def del_objects_after(user_id, reset_ts, is_dry_run):
    del_query = {}
    # handle the user
    del_query.update({"user_id": user_id})

    date_query_list = []
    # handle all trip-like entries
    date_query_list.append({"data.start_ts": {"$gt": reset_ts}})
    # handle all place-like entries
    date_query_list.append({"data.enter_ts": {"$gt": reset_ts}})
    # handle all reconstructed points
    date_query_list.append({"data.ts": {"$gt": reset_ts}})
    del_query.update({"$or": date_query_list})
    logging.debug("After all updates, del_query = %s" % del_query)

    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

    if is_dry_run:
        logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)
def reset_last_place(last_place, is_dry_run):
    if is_dry_run:
        logging.info("this is a dry-run, returning from reset_last_place without modifying anything")
        return

    match_query = {"_id": last_place['_id']}
    logging.debug("match query = %s" % match_query)

    # Note that we need to reset the raw_place array
    # since it will be repopulated with new squished places
    # when the timeline after the _entry_ to this place is reconstructed
    # Note that
    # "If the field does not exist, then $unset does nothing (i.e. no
    # operation).", so this is still OK.
    reset_query = {'$unset': {"data.exit_ts": "",
                              "data.exit_local_dt": "",
                              "data.exit_fmt_time": "",
                              "data.starting_trip": "",
                              "data.duration": ""}}
    if last_place.metadata.key == esda.CLEANED_PLACE_KEY:
        reset_query.update({"$set": {"data.raw_places": []}})
    logging.debug("reset_query = %s" % reset_query)

    result = edb.get_analysis_timeseries_db().update(match_query, reset_query)
    logging.debug("this is not a dry run, result of update in reset_last_place = %s" % result)
    logging.debug("after update, entry is %s" %
                  edb.get_analysis_timeseries_db().find_one(match_query))
def tearDown(self):
    os.remove(self.analysis_conf_path)
    edb.get_timeseries_db().remove({"user_id": self.androidUUID})
    edb.get_timeseries_db().remove({"user_id": self.iosUUID})
    edb.get_pipeline_state_db().remove({"user_id": self.androidUUID})
    edb.get_pipeline_state_db().remove({"user_id": self.iosUUID})
    edb.get_analysis_timeseries_db().remove({"user_id": self.androidUUID})
    edb.get_analysis_timeseries_db().remove({"user_id": self.iosUUID})
def testReadOnlyUser(self):
    try:
        ro_username = "******"
        ro_password = "******"
        self.stagedb_auth = pymongo.MongoClient(
            self.getURL(self.test_username, self.test_password)).Stage_database
        self.stagedb_auth.command({
            "createRole": "createIndex",
            "privileges": [{
                "resource": {"db": "Stage_database", "collection": ""},
                "actions": ["createIndex"]
            }],
            "roles": []
        })
        role_result = self.stagedb_auth.command({
            "rolesInfo": 1,
            "showBuiltinRoles": False,
            "showPrivileges": True})
        logging.debug("role_result = %s" % role_result)
        self.assertEqual(role_result['ok'], 1.0)
        self.assertEqual(len(role_result['roles']), 1)
        self.assertEqual(role_result['roles'][0]['role'], "createIndex")
        self.assertEqual(role_result['roles'][0]['db'], "Stage_database")
        self.assertEqual(len(role_result['roles'][0]['privileges']), 1)
        self.assertEqual(role_result['roles'][0]['privileges'][0]["actions"],
                         ["createIndex"])

        self.admin_auth.command({
            "createUser": ro_username,
            "pwd": ro_password,
            "roles": [{"role": "read", "db": "Stage_database"},
                      {"role": "createIndex", "db": "Stage_database"}]
        })
        result = self.admin_auth.command({"usersInfo": ro_username})
        self.assertEqual(result['ok'], 1.0)
        self.assertEqual(len(result['users']), 1)
        self.assertEqual(result['users'][0]['user'], ro_username)

        self.configureDB(self.getURL(ro_username, ro_password))

        import emission.tests.storageTests.analysis_ts_common as etsa
        import emission.storage.decorations.analysis_timeseries_queries as esda
        import emission.core.wrapper.rawplace as ecwrp
        import emission.storage.timeseries.abstract_timeseries as esta

        ts = esta.TimeSeries.get_time_series(self.uuid)
        with self.assertRaises(pymongo.errors.OperationFailure):
            etsa.createNewPlaceLike(self, esda.RAW_PLACE_KEY, ecwrp.Rawplace)
        inserted_df = ts.get_data_df(esda.RAW_PLACE_KEY)
        self.assertEqual(len(inserted_df), 0)
        self.assertEqual(len(ts.get_data_df(esda.CLEANED_PLACE_KEY)), 0)
    finally:
        import emission.core.get_database as edb
        with self.assertRaises(pymongo.errors.OperationFailure):
            edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUserId})
        self.stagedb_auth.command({"dropAllRolesFromDatabase": 1})
def savePlaceLike(utest, key, wrapper):
    new_place = createNewPlaceLike(utest, key, wrapper)
    utest.assertEqual(edb.get_analysis_timeseries_db().count_documents(
        {"metadata.key": key, "data.exit_ts": 6}), 1)
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.exit_ts": 6})["_id"], new_place.get_id())
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.exit_ts": 6})["user_id"], utest.testUserId)
    return new_place
def _get_sections_for_query(section_query, sort_field):
    section_query.update({"metadata.key": "segmentation/raw_section"})
    logging.debug("Returning sections for query %s" % section_query)
    section_doc_cursor = edb.get_analysis_timeseries_db().find(
        section_query).sort(sort_field, pymongo.ASCENDING)
    logging.debug("result length = %d" %
                  edb.get_analysis_timeseries_db().count_documents(section_query))
    return [ecwe.Entry(doc) for doc in section_doc_cursor]
def saveTripLike(utest, key, wrapper):
    new_trip = createNewTripLike(utest, key, wrapper)
    utest.assertEqual(edb.get_analysis_timeseries_db().find(
        {"metadata.key": key, "data.end_ts": 6}).count(), 1)
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.end_ts": 6})["_id"], new_trip.get_id())
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.end_ts": 6})["user_id"], utest.testUserId)
    return new_trip
def _get_stops_for_query(stop_query, sort_key):
    logging.debug("Returning stops for query %s" % stop_query)
    stop_query.update({"metadata.key": "segmentation/raw_stop"})
    logging.debug("updated query = %s" % stop_query)
    stop_doc_cursor = edb.get_analysis_timeseries_db().find(stop_query).sort(
        sort_key, pymongo.ASCENDING)
    logging.debug("result count = %d" %
                  edb.get_analysis_timeseries_db().count_documents(stop_query))
    return [ecwe.Entry(doc) for doc in stop_doc_cursor]
def savePlaceLike(utest, key, wrapper):
    new_place = createNewPlaceLike(utest, key, wrapper)
    utest.assertEqual(edb.get_analysis_timeseries_db().find(
        {"metadata.key": key, "data.exit_ts": 6}).count(), 1)
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.exit_ts": 6})["_id"], new_place.get_id())
    utest.assertEqual(edb.get_analysis_timeseries_db().find_one(
        {"metadata.key": key, "data.exit_ts": 6})["user_id"], utest.testUserId)
    return new_place
def del_objects(args):
    del_query = {}
    if args.user_id != "all":
        del_query['user_id'] = uuid.UUID(args.user_id)

    if args.date is None:
        print("Deleting all analysis information for query %s" % del_query)
        print(edb.get_analysis_timeseries_db().remove(del_query))
        print(edb.get_common_place_db().remove(del_query))
        print(edb.get_common_trip_db().remove(del_query))
def clearRelatedDb(self):
    edb.get_timeseries_db().delete_many({"user_id": {"$in": self.testUUIDList}})
    edb.get_analysis_timeseries_db().delete_many({"user_id": {"$in": self.testUUIDList}})
    edb.get_usercache_db().delete_many({"user_id": {"$in": self.testUUIDList}})
    edb.get_uuid_db().delete_many({"user_id": {"$in": self.testUUIDList}})
def purge_entries_for_user(curr_uuid, is_purge_state, db_array=None):
    logging.info("For uuid = %s, deleting entries from the timeseries" % curr_uuid)
    if db_array is not None:
        [ts_db, ats_db, udb, psdb] = db_array
        logging.debug("db_array passed in with databases %s" % db_array)
    else:
        import emission.core.get_database as edb
        ts_db = edb.get_timeseries_db()
        ats_db = edb.get_analysis_timeseries_db()
        udb = edb.get_uuid_db()
        psdb = edb.get_pipeline_state_db()
        logging.debug("db_array not passed in, looking up databases")

    timeseries_del_result = ts_db.remove({"user_id": curr_uuid})
    logging.info("result = %s" % timeseries_del_result)
    logging.info("For uuid = %s, deleting entries from the analysis_timeseries" % curr_uuid)
    analysis_timeseries_del_result = ats_db.remove({"user_id": curr_uuid})
    logging.info("result = %s" % analysis_timeseries_del_result)
    logging.info("For uuid %s, deleting entries from the user_db" % curr_uuid)
    user_db_del_result = udb.remove({"uuid": curr_uuid})
    logging.info("result = %s" % user_db_del_result)
    if is_purge_state:
        logging.info("For uuid %s, deleting entries from the pipeline_state_db" % curr_uuid)
        psdb_del_result = psdb.remove({"user_id": curr_uuid})
        logging.info("result = %s" % psdb_del_result)
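# A hypothetical call sketch for purge_entries_for_user above (test_uuid is a
# placeholder): the four database handles can be injected via db_array, e.g. to
# reuse already-open handles in tests, instead of being looked up inside the
# function. The order must match the unpacking: [ts_db, ats_db, udb, psdb].
import emission.core.get_database as edb
purge_entries_for_user(test_uuid, is_purge_state=True,
                       db_array=[edb.get_timeseries_db(),
                                 edb.get_analysis_timeseries_db(),
                                 edb.get_uuid_db(),
                                 edb.get_pipeline_state_db()])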
def __init__(self, user_id):
    super(BuiltinTimeSeries, self).__init__(user_id)
    self.key_query = lambda key: {"metadata.key": key}
    self.type_query = lambda entry_type: {"metadata.type": entry_type}
    self.user_query = {"user_id": self.user_id}  # UUID is mandatory for this version
    self.timeseries_db = edb.get_timeseries_db()
    self.analysis_timeseries_db = edb.get_analysis_timeseries_db()
    self.ts_map = {
        "background/location": self.timeseries_db,
        "background/filtered_location": self.timeseries_db,
        "background/motion_activity": self.timeseries_db,
        "background/battery": self.timeseries_db,
        "statemachine/transition": self.timeseries_db,
        "config/sensor_config": self.timeseries_db,
        "segmentation/raw_trip": self.analysis_timeseries_db,
        "segmentation/raw_place": self.analysis_timeseries_db,
        "segmentation/raw_section": self.analysis_timeseries_db,
        "segmentation/raw_stop": self.analysis_timeseries_db,
        "analysis/smoothing": self.analysis_timeseries_db,
        "analysis/cleaned_trip": self.analysis_timeseries_db,
        "analysis/cleaned_place": self.analysis_timeseries_db,
        "analysis/cleaned_section": self.analysis_timeseries_db,
        "analysis/cleaned_stop": self.analysis_timeseries_db,
        "analysis/recreated_location": self.analysis_timeseries_db,
    }
def post_check(unique_user_list, all_rerun_list):
    import emission.core.get_database as edb
    import numpy as np

    logging.info("For %s users, loaded %s raw entries, %s processed entries and %s pipeline states" %
                 (len(unique_user_list),
                  edb.get_timeseries_db().count_documents(
                      {"user_id": {"$in": list(unique_user_list)}}),
                  edb.get_analysis_timeseries_db().count_documents(
                      {"user_id": {"$in": list(unique_user_list)}}),
                  edb.get_pipeline_state_db().count_documents(
                      {"user_id": {"$in": list(unique_user_list)}})))

    all_rerun_arr = np.array(all_rerun_list)

    # want to check if no entry needs a rerun? In this case we are done
    # no entry needs a rerun = all entries are false, not(all entries) are true
    if np.all(np.logical_not(all_rerun_list)):
        logging.info("all entries in the timeline contain analysis results, no need to run the intake pipeline")
    # if all entries need to be re-run, we must have had raw data throughout
    elif np.all(all_rerun_list):
        logging.info("all entries in the timeline contain only raw data, need to run the intake pipeline")
    else:
        logging.info("timeline contains a mixture of analysis results and raw data - complain to shankari!")
def _get_sections_for_query(section_query, sort_field):
    section_query.update({"metadata.key": "segmentation/raw_section"})
    logging.debug("Returning sections for query %s" % section_query)
    section_doc_cursor = edb.get_analysis_timeseries_db().find(
        section_query).sort(sort_field, pymongo.ASCENDING)
    logging.debug("result cursor length = %d" % section_doc_cursor.count())
    return [ecwe.Entry(doc) for doc in section_doc_cursor]
def export_timeline(user_id_str, start_day_str, end_day_str, file_name):
    logging.info("Extracting trips for user %s day %s -> %s and saving to file %s" %
                 (user_id_str, start_day_str, end_day_str, file_name))

    # day_dt = pydt.datetime.strptime(day_str, "%Y-%m-%d").date()
    start_day_dt = pydt.datetime.strptime(start_day_str, "%Y-%m-%d")
    end_day_dt = pydt.datetime.strptime(end_day_str, "%Y-%m-%d")
    logging.debug("start_day_dt is %s, end_day_dt is %s" % (start_day_dt, end_day_dt))

    # TODO: Convert to call to get_timeseries once we get that working
    # Or should we even do that?
    query = {'user_id': uuid.UUID(user_id_str),
             'start_local_dt': {'$gt': start_day_dt, "$lt": end_day_dt}}
    print("query = %s" % query)

    entry_list = list(edb.get_analysis_timeseries_db().find(query))
    logging.info("Found %d entries" % len(entry_list))
    json.dump(entry_list, open(file_name, "w"),
              default=bju.default, allow_nan=False, indent=4)
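# A hypothetical invocation sketch for export_timeline above; the UUID string,
# date range, and output path are placeholders, not values from the source.
export_timeline("e4c1b6c4-0000-0000-0000-000000000000",
                "2016-03-01", "2016-03-31", "/tmp/user_timeline.json")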
def _get_stops_for_query(stop_query, sort_key):
    logging.debug("Returning stops for query %s" % stop_query)
    stop_query.update({"metadata.key": "segmentation/raw_stop"})
    logging.debug("updated query = %s" % stop_query)
    stop_doc_cursor = edb.get_analysis_timeseries_db().find(stop_query).sort(
        sort_key, pymongo.ASCENDING)
    logging.debug("result count = %d" % stop_doc_cursor.count())
    return [ecwe.Entry(doc) for doc in stop_doc_cursor]
def get_stops_for_trip(key, user_id, trip_id):
    """
    Get the set of stops that are children of this trip.
    """
    query = {"user_id": user_id, "data.trip_id": trip_id, "metadata.key": key}
    logging.debug("About to execute query %s with sort_key %s" %
                  (query, "data.enter_ts"))
    stop_doc_cursor = edb.get_analysis_timeseries_db().find(query).sort(
        "data.enter_ts", pymongo.ASCENDING)
    return [ecwe.Entry(doc) for doc in stop_doc_cursor]
def get_sections_for_trip(key, user_id, trip_id):
    # type: (UUID, object_id) -> list(sections)
    """
    Get the set of sections that are children of this trip.
    """
    query = {"user_id": user_id, "data.trip_id": trip_id, "metadata.key": key}
    section_doc_cursor = edb.get_analysis_timeseries_db().find(query).sort(
        "data.start_ts", pymongo.ASCENDING)
    logging.debug("About to execute query %s" % query)
    return [ecwe.Entry(doc) for doc in section_doc_cursor]
def _del_entries_for_query(del_query, is_dry_run):
    """
    This is much easier. The steps are:
    - delete all analysis objects for this user
    - delete all pipeline states for this user
    """
    logging.info("About to delete %s analysis results" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))
    logging.info("About to delete %s pipeline states" %
                 edb.get_pipeline_state_db().find(del_query).count())
    if is_dry_run:
        logging.info("this is a dry run, returning from reset_user_to_start without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry run, result of removing analysis objects = %s" % result)
        result = edb.get_pipeline_state_db().remove(del_query)
        logging.info("this is not a dry run, result of removing pipeline states = %s" % result)
def move_ts_entries(key):
    tdb = edb.get_timeseries_db()
    atdb = edb.get_analysis_timeseries_db()

    result_cursor = tdb.find({'metadata.key': key})
    logging.info("About to convert %s entries" % result_cursor.count())
    for i, entry_doc in enumerate(result_cursor):
        if i % 10000 == 0:
            print("moved %s from one ts to the other" % (entry_doc))
        atdb.insert(entry_doc)
        tdb.remove(entry_doc)
def move_ts_entries(key):
    tdb = edb.get_timeseries_db()
    atdb = edb.get_analysis_timeseries_db()

    result_cursor = tdb.find({'metadata.key': key})
    logging.info("About to convert %s entries" % result_cursor.count())
    for i, entry_doc in enumerate(result_cursor):
        try:
            if i % 10000 == 0:
                print("moved %s from one ts to the other" % (entry_doc))
            atdb.insert(entry_doc)
            # tdb.remove(entry_doc)
        except Exception:
            logging.info("Got error while moving %s, skipping" % (entry_doc))
def _get_inference_entry_for_section(user_id, section_id, entry_key, section_id_key):
    prediction_key_query = {"metadata.key": entry_key}
    inference_query = {"user_id": user_id, section_id_key: section_id}
    combo_query = copy.copy(prediction_key_query)
    combo_query.update(inference_query)
    logging.debug("About to query %s" % combo_query)

    ret_list = list(edb.get_analysis_timeseries_db().find(combo_query))
    # We currently have only one algorithm
    assert len(ret_list) <= 1, "Found len(ret_list) = %d, expected <=1" % len(ret_list)
    if len(ret_list) == 0:
        logging.debug("Found no inferred prediction, returning None")
        return None

    assert len(ret_list) == 1, "Found ret_list of length %d, expected 1" % len(ret_list)
    curr_prediction = ecwe.Entry(ret_list[0])
    return curr_prediction
def clearRelatedDb(self):
    logging.info("Timeseries delete result %s" %
                 edb.get_timeseries_db().delete_many(
                     {"user_id": self.testUUID}).raw_result)
    logging.info("Analysis delete result %s" %
                 edb.get_analysis_timeseries_db().delete_many(
                     {"user_id": self.testUUID}).raw_result)
    logging.info("Usercache delete result %s" %
                 edb.get_usercache_db().delete_many(
                     {"user_id": self.testUUID}).raw_result)
def del_objects(args):
    del_query = {}
    if args.user_id != "all":
        del_query['user_id'] = uuid.UUID(args.user_id)

    trip_query = copy.copy(del_query)
    trip_query.update({"metadata.key": {"$in":
        ["segmentation/raw_trip", "analysis/cleaned_trip",
         "segmentation/raw_section", "analysis/cleaned_section"]}})

    place_query = copy.copy(del_query)
    place_query.update({"metadata.key": {"$in":
        ["segmentation/raw_place", "analysis/cleaned_place",
         "segmentation/raw_stop", "analysis/cleaned_stop"]}})

    point_query = copy.copy(del_query)
    point_query.update({"metadata.key": {"$in": ["analysis/recreated_location"]}})

    if args.date is None:
        logging.debug("no date specified, deleting everything")
    else:
        day_dt = pydt.datetime.strptime(args.date, "%Y-%m-%d")
        logging.debug("day_dt is %s" % day_dt)
        day_ts = time.mktime(day_dt.timetuple())
        logging.debug("day_ts is %s" % day_ts)
        trip_query.update({"data.start_ts": {"$gt": day_ts}})
        place_query.update({"data.exit_ts": {"$gt": day_ts}})
        point_query.update({"data.ts": {"$gt": day_ts}})

    print("trip_query = %s" % trip_query)
    print("place_query = %s" % place_query)
    print("point_query = %s" % point_query)

    # Since sections have the same basic structure as trips and stops have the
    # same basic structure as places, we can reuse the queries
    print("Deleting trips/sections for %s after %s" % (args.user_id, args.date))
    print(edb.get_analysis_timeseries_db().remove(trip_query))
    print("Deleting places/stops for %s after %s" % (args.user_id, args.date))
    print(edb.get_analysis_timeseries_db().remove(place_query))
    print("Deleting points for %s after %s" % (args.user_id, args.date))
    print(edb.get_analysis_timeseries_db().remove(point_query))
def post_check(unique_user_list, all_rerun_list):
    import emission.core.get_database as edb
    import numpy as np

    logging.info("For %s users, loaded %s raw entries and %s processed entries" %
                 (len(unique_user_list),
                  edb.get_timeseries_db().find(
                      {"user_id": {"$in": list(unique_user_list)}}).count(),
                  edb.get_analysis_timeseries_db().find(
                      {"user_id": {"$in": list(unique_user_list)}}).count()))

    all_rerun_arr = np.array(all_rerun_list)

    # want to check if no entry needs a rerun? In this case we are done
    # no entry needs a rerun = all entries are false, not(all entries) are true
    if np.all(np.logical_not(all_rerun_list)):
        logging.info("all entries in the timeline contain analysis results, no need to run the intake pipeline")
    # if all entries need to be re-run, we must have had raw data throughout
    elif np.all(all_rerun_list):
        logging.info("all entries in the timeline contain only raw data, need to run the intake pipeline")
    else:
        logging.info("timeline contains a mixture of analysis results and raw data - complain to shankari!")
def clearRelevantSections(self):
    edb.get_analysis_timeseries_db().drop()
from future import standard_library
standard_library.install_aliases()
from builtins import *

import logging
import pandas as pd
import pymongo
import itertools

import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta
import emission.core.wrapper.entry as ecwe

ts_enum_map = {
    esta.EntryType.DATA_TYPE: edb.get_timeseries_db(),
    esta.EntryType.ANALYSIS_TYPE: edb.get_analysis_timeseries_db()
}


class BuiltinTimeSeries(esta.TimeSeries):
    def __init__(self, user_id):
        super(BuiltinTimeSeries, self).__init__(user_id)
        self.key_query = lambda key: {"metadata.key": key}
        self.type_query = lambda entry_type: {"metadata.type": entry_type}
        self.user_query = {"user_id": self.user_id}  # UUID is mandatory for this version
        self.timeseries_db = ts_enum_map[esta.EntryType.DATA_TYPE]
        self.analysis_timeseries_db = ts_enum_map[esta.EntryType.ANALYSIS_TYPE]
        # Design question: Should the stats be a separate database, or should it be part
        # of the timeseries database? Technically, it should be part of the timeseries
        # database. However, I am concerned about the performance of the database
        # with even more entries - it already takes 10 seconds to query for a document
        # and I am not sure that adding a ton more data is going to make that better
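# A minimal sketch (not part of the original module) of what ts_enum_map above
# provides: each EntryType resolves to the pymongo collection that backs it,
# so BuiltinTimeSeries can route reads and writes by entry type.
raw_db = ts_enum_map[esta.EntryType.DATA_TYPE]
analysis_db = ts_enum_map[esta.EntryType.ANALYSIS_TYPE]
print("analysis entries so far = %d" % analysis_db.count_documents({}))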
def tearDown(self):
    edb.get_analysis_timeseries_db().remove({'user_id': self.testUUID})
def setUp(self):
    self.testUserId = uuid.uuid3(uuid.NAMESPACE_URL, "mailto:[email protected]")
    edb.get_analysis_timeseries_db().remove({'user_id': self.testUserId})
def clearRelatedDb(self):
    edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
    edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
    edb.get_usercache_db().delete_many({"user_id": self.testUUID})
def clearRelatedDb(self):
    edb.get_timeseries_db().remove({"user_id": self.testUUID})
    edb.get_analysis_timeseries_db().remove({"user_id": self.testUUID})