def export_timeline(user_id, start_day_str, end_day_str, file_name):
    logging.info("Extracting timeline for user %s day %s -> %s and saving to file %s" %
                 (user_id, start_day_str, end_day_str, file_name))

    # day_dt = pydt.datetime.strptime(day_str, "%Y-%m-%d").date()
    start_day_ts = arrow.get(start_day_str).timestamp
    end_day_ts = arrow.get(end_day_str).timestamp
    logging.debug("start_day_ts = %s (%s), end_day_ts = %s (%s)" %
                  (start_day_ts, arrow.get(start_day_ts),
                   end_day_ts, arrow.get(end_day_ts)))

    ts = esta.TimeSeries.get_time_series(user_id)
    loc_time_query = estt.TimeQuery("data.ts", start_day_ts, end_day_ts)
    loc_entry_list = list(estcs.find_entries(user_id, key_list=None,
                                             time_query=loc_time_query))
    ma_time_query = estt.TimeQuery("metadata.write_ts", start_day_ts, end_day_ts)
    ma_entry_list = list(estcs.find_entries(user_id,
                                            key_list=["background/motion_activity"],
                                            time_query=ma_time_query))
    trip_time_query = estt.TimeQuery("data.start_ts", start_day_ts, end_day_ts)
    trip_entry_list = list(ts.find_entries(key_list=None, time_query=trip_time_query))
    place_time_query = estt.TimeQuery("data.enter_ts", start_day_ts, end_day_ts)
    place_entry_list = list(ts.find_entries(key_list=None, time_query=place_time_query))

    # Handle the case of the first place, which has no enter_ts and won't be
    # matched by the default query
    first_place_extra_query = {'$and': [{'data.enter_ts': {'$exists': False}},
                                        {'data.exit_ts': {'$exists': True}}]}
    first_place_entry_list = list(ts.find_entries(key_list=None, time_query=None,
                                                  extra_query_list=[first_place_extra_query]))
    logging.info("First place entry list = %s" % first_place_entry_list)

    combined_list = loc_entry_list + ma_entry_list + trip_entry_list + \
                    place_entry_list + first_place_entry_list
    logging.info("Found %d loc entries, %d motion entries, %d trip-like entries, %d place-like entries = %d total entries" %
                 (len(loc_entry_list), len(ma_entry_list), len(trip_entry_list),
                  len(place_entry_list), len(combined_list)))

    validate_truncation(loc_entry_list, trip_entry_list, place_entry_list)

    unique_key_list = set([e["metadata"]["key"] for e in combined_list])
    logging.info("timeline has unique keys = %s" % unique_key_list)

    if len(combined_list) == 0 or unique_key_list == set(['stats/pipeline_time']):
        logging.info("No entries found in range for user %s, skipping save" % user_id)
    else:
        # Also dump the pipeline state, since that's where we track how far the
        # analysis has gotten. This allows us to copy data to a different
        # *live system*, not just duplicate it for analysis
        combined_filename = "%s_%s.gz" % (file_name, user_id)
        with gzip.open(combined_filename, "wt") as gcfd:
            json.dump(combined_list, gcfd, default=bju.default,
                      allow_nan=False, indent=4)

        import emission.core.get_database as edb
        pipeline_state_list = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
        logging.info("Found %d pipeline states %s" %
                     (len(pipeline_state_list),
                      list([ps["pipeline_stage"] for ps in pipeline_state_list])))

        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_name, user_id)
        with gzip.open(pipeline_filename, "wt") as gpfd:
            json.dump(pipeline_state_list, gpfd, default=bju.default,
                      allow_nan=False, indent=4)
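# A minimal usage sketch for export_timeline above, assuming the standard
# e-mission import aliases (esta, estt, estcs, bju) are already set up.
# The uuid and output prefix shown here are hypothetical.
import uuid

def export_timeline_example():
    sample_user_id = uuid.UUID("0a0a0a0a-0a0a-0a0a-0a0a-0a0a0a0a0a0a")  # hypothetical id
    # Dumps all matching entries to /tmp/sample_export_<uuid>.gz and the
    # corresponding pipeline state to /tmp/sample_export_pipelinestate_<uuid>.gz
    export_timeline(sample_user_id, "2016-03-01", "2016-03-31", "/tmp/sample_export")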
def incident_heatmap(user_uuid, modes, time_query, region):
    """
    Return a list of geojson points with properties for the time and the
    stress level related to incidents. This should not return full entries
    because that can expose the user_id in the aggregate case. Maybe it can
    return the data part only? Or should we put the other entries into the
    properties?
    :param modes: The modes that we want to query for
    :param time_query: The time query, in either local date or timestamp
    :param region: The region of interest
    :return: list of `incident` objects, with all metadata stripped out
    """
    if region is None:
        geo_query = None
    else:
        geo_query = estg.GeoQuery(["data.loc"], region)

    extra_query_list = []
    if modes is not None:
        mode_enum_list = [ecwm.MotionTypes[mode] for mode in modes]
        extra_query_list.append(esdlq.get_mode_query(mode_enum_list))

    if user_uuid is None:
        incident_entry_list = esda.get_entries(MANUAL_INCIDENT_KEY, user_id=None,
                                               time_query=time_query,
                                               geo_query=geo_query,
                                               extra_query_list=extra_query_list)
    else:
        # We don't support aggregate queries on the usercache. And that is
        # actually fine, because we don't expect immediate results for the
        # aggregate case. We just want to query the usercache to ensure that
        # the incidents don't magically disappear just because they got pushed
        # to the server but are not yet processed
        incident_entry_list = estc.find_entries([MANUAL_INCIDENT_KEY], time_query)
    return {"incidents": [e.data for e in incident_entry_list]}
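# A minimal usage sketch for incident_heatmap above. The timestamp range is
# hypothetical, region=None skips the geo filter, and "BICYCLING" assumes a
# member of the ecwm.MotionTypes enum.
def incident_heatmap_example():
    tq = estt.TimeQuery("data.ts", 1456790400, 1459468800)  # hypothetical range
    # user_uuid=None selects the aggregate branch, which strips out metadata
    return incident_heatmap(None, ["BICYCLING"], tq, None)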
def get_user_input_from_cache_series(user_id, trip_obj, user_input_key):
    tq = estt.TimeQuery("data.start_ts", trip_obj.data.start_ts,
                        trip_obj.data.end_ts)
    ts = esta.TimeSeries.get_time_series(user_id)
    potential_candidates = estsc.find_entries(user_id, [user_input_key], tq)
    return final_candidate(valid_user_input(ts, trip_obj), potential_candidates)
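# A minimal usage sketch for get_user_input_from_cache_series above, assuming
# trip_obj is an entry-style wrapper exposing data.start_ts / data.end_ts and
# that "manual/mode_confirm" is a valid user-input key in this deployment.
def match_mode_confirm_example(user_id, trip_obj):
    # Returns the matching user input entry for the trip, or None if there is none
    return get_user_input_from_cache_series(user_id, trip_obj,
                                            "manual/mode_confirm")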
def getTimeseriesEntries(time_type):
    if 'user' not in request.json:
        abort(401, "only a user can read his/her data")
    user_uuid = getUUID(request)

    key_list = request.json['key_list']
    if 'from_local_date' in request.json and 'to_local_date' in request.json:
        start_time = request.json['from_local_date']
        end_time = request.json['to_local_date']
        time_query = esttc.TimeComponentQuery("metadata.write_ts",
                                              start_time, end_time)
    else:
        start_time = request.json['start_time']
        end_time = request.json['end_time']
        time_query = estt.TimeQuery("metadata.write_ts",
                                    start_time, end_time)
    # Note that queries from usercache are limited to 100,000 entries
    # and entries from timeseries are limited to 250,000, so we will
    # return at most 350,000 entries. So this means that we don't need
    # additional filtering, but this should be documented in
    # the API
    data_list = esdc.find_entries(user_uuid, key_list, time_query)
    return {'phone_data': data_list}
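# A sketch of a client-side call against the getTimeseriesEntries endpoint
# above. The route path and the field values are assumptions, not taken from
# this file; the body shape follows the request.json reads in the handler.
import requests  # assumed available in the client environment

def fetch_entries_example(server_url, user_token):
    body = {
        'user': user_token,  # the handler aborts with 401 if 'user' is missing
        'key_list': ['background/location'],
        'start_time': 1456790400,  # no *_local_date keys, so the timestamp branch runs
        'end_time': 1459468800,
    }
    r = requests.post(server_url + "/datastreams/find_entries/timestamp",  # hypothetical route
                      json=body)
    return r.json()['phone_data']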
def getPublicData():
    ids = request.json['phone_ids']
    all_uuids = [UUID(id) for id in ids]
    # Only known test phones are eligible for the public data endpoint
    uuids = [uuid for uuid in all_uuids if uuid in estag.TEST_PHONE_IDS]

    from_ts = request.query.from_ts
    to_ts = request.query.to_ts

    time_range = estt.TimeQuery("metadata.write_ts", float(from_ts), float(to_ts))
    time_query = time_range.get_query()

    user_queries = [{'user_id': uuid} for uuid in uuids]
    for q in user_queries:
        q.update(time_query)

    num_entries_ts = [edb.get_timeseries_db().find(q).count() for q in user_queries]
    num_entries_uc = [edb.get_usercache_db().find(q).count() for q in user_queries]
    total_entries = sum(num_entries_ts + num_entries_uc)
    logging.debug("Total entries requested: %d" % total_entries)

    threshold = 200000
    if total_entries > threshold:
        data_list = None
    else:
        data_list = [esdc.find_entries(u, None, time_range) for u in uuids]

    return {'phone_data': data_list}
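# A sketch of how a client might call the getPublicData endpoint above; the
# route and phone id are hypothetical. Note that from_ts/to_ts travel as URL
# query parameters (read via request.query) while phone_ids travels in the
# JSON body.
import requests  # assumed available in the client environment

def fetch_public_data_example(server_url):
    body = {'phone_ids': ["0b0b0b0b-0b0b-0b0b-0b0b-0b0b0b0b0b0b"]}  # hypothetical test phone
    r = requests.post(server_url + "/eval/publicData/timeseries"  # hypothetical route
                      "?from_ts=1456790400&to_ts=1459468800",
                      json=body)
    # phone_data is None if the requested range exceeds the entry threshold
    return r.json()['phone_data']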
def get_user_input_from_cache_series(user_id, trip_obj, user_input_key):
    tq = estt.TimeQuery("data.start_ts", trip_obj.data.start_ts,
                        trip_obj.data.end_ts)
    potential_candidates = estsc.find_entries(user_id, [user_input_key], tq)
    if len(potential_candidates) == 0:
        return None

    # Last write wins: sort by write time and pick the newest candidate
    sorted_pc = sorted(potential_candidates, key=lambda c: c["metadata"]["write_ts"])
    most_recent_entry = sorted_pc[-1]
    logging.debug("most recent entry has id %s" % most_recent_entry["_id"])
    logging.debug("and is mapped to entry %s" % most_recent_entry)
    return ecwe.Entry(most_recent_entry)
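# A minimal self-contained sketch of the last-write-wins behavior above: if a
# user edits their input twice for the same trip, the candidate with the
# larger metadata.write_ts is the one returned. The entry shapes here are
# hypothetical stand-ins for real cache entries.
def pick_most_recent_example():
    candidates = [
        {"_id": 1, "metadata": {"write_ts": 100}, "data": {"label": "bike"}},
        {"_id": 2, "metadata": {"write_ts": 200}, "data": {"label": "walk"}},
    ]
    sorted_pc = sorted(candidates, key=lambda c: c["metadata"]["write_ts"])
    return sorted_pc[-1]  # -> the write_ts=200 ("walk") entry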
def getTimeseriesEntries(time_type):
    if 'user' not in request.json:
        abort(401, "only a user can read his/her data")
    user_uuid = getUUID(request)

    key_list = request.json['key_list']
    if 'from_local_date' in request.json and 'to_local_date' in request.json:
        start_time = request.json['from_local_date']
        end_time = request.json['to_local_date']
        time_key = request.json.get('key_local_date', 'metadata.write_ts')
        time_query = esttc.TimeComponentQuery(time_key, start_time, end_time)
    else:
        start_time = request.json['start_time']
        end_time = request.json['end_time']
        time_key = request.json.get('key_time', 'metadata.write_ts')
        time_query = estt.TimeQuery(time_key, start_time, end_time)
    # Note that queries from usercache are limited to 100,000 entries
    # and entries from timeseries are limited to 250,000, so we will
    # return at most 350,000 entries. So this means that we don't need
    # additional filtering, but this should be documented in
    # the API
    data_list = esdc.find_entries(user_uuid, key_list, time_query)
    if 'max_entries' in request.json:
        me = request.json['max_entries']
        if (type(me) != int):
            logging.error("aborting: max entry count is %s, type %s, expected int" %
                          (me, type(me)))
            abort(500, "Invalid max_entries %s" % me)
        if len(data_list) > me:
            trunc_method = request.json.get('trunc_method')
            if trunc_method == 'first':
                logging.debug("first n entries is %s" % me)
                data_list = data_list[:me]
            elif trunc_method == 'last':
                logging.debug("last n entries is %s" % me)
                data_list = data_list[-me:]
            elif trunc_method == "sample":
                sample_rate = len(data_list)//me + 1
                logging.debug("sampling rate is %s" % sample_rate)
                data_list = data_list[::sample_rate]
            else:
                logging.error("aborting: unexpected truncation method %s" % trunc_method)
                abort(500, "invalid trunc_method while retrieving limited data")
        else:
            logging.debug("Found %d entries < %s, no truncation" % (len(data_list), me))
    logging.debug("successfully returning list of size %s" % len(data_list))
    return {'phone_data': data_list}
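# A sketch of a truncated query against the extended getTimeseriesEntries
# above; the route and values are assumptions. With max_entries plus
# trunc_method="sample", the server thins the result to roughly max_entries
# evenly spaced entries instead of cutting off one end.
import requests  # assumed available in the client environment

def fetch_sampled_entries_example(server_url, user_token):
    body = {
        'user': user_token,
        'key_list': ['background/location'],
        'start_time': 1456790400,
        'end_time': 1459468800,
        'key_time': 'data.ts',     # overrides the metadata.write_ts default
        'max_entries': 5000,
        'trunc_method': 'sample',  # one of 'first', 'last', 'sample'
    }
    r = requests.post(server_url + "/datastreams/find_entries/timestamp",  # hypothetical route
                      json=body)
    return r.json()['phone_data']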