import json
import os
import statistics
import uuid
from datetime import datetime, timedelta

# Third-party great-circle distance helper used by SqlToCCStream
# (pip install haversine; this code uses the pre-2.0 `miles=True` keyword).
from haversine import haversine

# NOTE: the imports below assume the md2k CerebralCortex 2.x package layout;
# adjust the module paths if your installation differs.
from cerebralcortex.cerebralcortex import CerebralCortex
from cerebralcortex.core.datatypes.datapoint import DataPoint
from cerebralcortex.core.datatypes.datastream import DataStream

# SqlData is this project's SQL helper for the environmental/weather tables;
# its module path is not shown in this file, so import it from wherever it
# lives in your package.


class InputStreamsAnalyzer():
    """
    This class is responsible for computing features based on streams of data
    derived from the smartphone sensors.
    """

    def get_day_data(self, userid, day, stream_name, localtime=False):
        """
        Return all DataPoints for the given user, day, and stream name,
        merged across every stream id registered under that name.

        :param str userid: User identifier
        :param str day: Day in YYYYMMDD format
        :param str stream_name: Name of the input stream
        :param bool localtime: Whether to interpret the day in local time
        :return: DataPoints for the day, sorted by start_time when merged
            from more than one stream id
        :rtype: List(DataPoint)
        """
        data = []
        stream_ids = self.CC.get_stream_id(userid, stream_name)
        for stream_id in stream_ids:
            if stream_id is not None:
                ds = self.CC.get_stream(stream_id['identifier'],
                                        user_id=userid, day=day,
                                        localtime=localtime)
                if ds is not None and ds.data is not None:
                    data += ds.data
        if len(stream_ids) > 1:
            data = sorted(data, key=lambda x: x.start_time)
        return data

    def analyze_all_users(self, userids, alldays, config_path):
        for x, usr in enumerate(userids):
            print('Analyzing user %d %s' % (x, usr))
            self.analyze_user(usr, alldays, config_path)

    def analyze_user(self, userid, alldays, config_path):
        print(userid, alldays)
        self.CC = CerebralCortex(config_path)
        self.window_size = 3600
        metadata = """
        {
          "annotations": [],
          "data_descriptor": [
            {
              "name": "total_datapoints",
              "type": "int",
              "description": "Total number of data points present in the input stream, followed by an array of the corrupt datapoints",
              "stream_type": "sparse"
            }
          ],
          "execution_context": {
            "processing_module": {
              "name": "core.admission_control_marker.phone_stream_analyzer",
              "input_streams": [
                {
                  "name": "name",
                  "identifier": "id"
                }
              ]
            },
            "algorithm": {
              "method": "core.admission_control_marker",
              "authors": [
                {
                  "name": "Anand",
                  "email": "*****@*****.**"
                }
              ],
              "version": "0.0.4",
              "description": "Analyzer for the phone input streams"
            }
          },
          "name": "NAME_dynamically_generated"
        }
        """
        date_format = '%Y%m%d'
        for day in alldays:
            # phone_input_streams is a module-level dict mapping each phone
            # stream name to its admission-control callable (see the sketch
            # after this class).
            for phone_stream in phone_input_streams:
                current_date = datetime.strptime(day, date_format)
                day_data = self.get_day_data(userid, day, phone_stream)
                data_quality_analysis = []
                if len(day_data):
                    corrupt_data = self.get_corrupt_data(
                        day_data, phone_input_streams[phone_stream])
                    # offset in milliseconds; assumes tz-aware timestamps
                    utc_offset = day_data[0].start_time.utcoffset().total_seconds() * 1000
                    dp = DataPoint(start_time=current_date,
                                   end_time=current_date + timedelta(days=1),
                                   offset=utc_offset,
                                   sample=[len(day_data), corrupt_data])
                    data_quality_analysis.append(dp)
                else:
                    next_day = current_date + timedelta(days=1)
                    utc_offset = 0
                    dp = DataPoint(start_time=current_date,
                                   end_time=next_day,
                                   offset=utc_offset,
                                   sample=[0, []])
                    data_quality_analysis.append(dp)

                metadata_json = json.loads(metadata)
                metadata_name = phone_stream + '_corrupt_data'
                # deterministic stream id derived from name, user, and metadata
                output_stream_id = str(uuid.uuid3(
                    uuid.NAMESPACE_DNS,
                    str(metadata_name + userid + str(metadata))))
                input_streams = []
                input_stream_ids = self.CC.get_stream_id(userid, phone_stream)
                for inpstrm in input_stream_ids:
                    stream_info = {'name': phone_stream,
                                   'identifier': inpstrm['identifier']}
                    input_streams.append(stream_info)
                metadata_json["execution_context"]["processing_module"]["input_streams"] = \
                    input_streams
                quality_ds = DataStream(
                    identifier=output_stream_id,
                    owner=userid,
                    name=metadata_name,
                    data_descriptor=metadata_json['data_descriptor'],
                    execution_context=metadata_json['execution_context'],
                    annotations=metadata_json['annotations'],
                    stream_type=1,
                    data=data_quality_analysis)
                try:
                    self.CC.save_stream(quality_ds)
                except Exception as e:
                    print(e)

    def get_corrupt_data(self, data, admission_control=None):
        """
        Return the filtered list of DataPoints according to the admission
        control provided.

        :param List(DataPoint) data: Input data list
        :param Callable[[Any], bool] admission_control: Admission control
            lambda function, which accepts the sample and returns a bool
            based on the data sample's validity
        :return: Filtered list of DataPoints
        :rtype: List(DataPoint)
        """
        if admission_control is None:
            return []

        corrupt_data = []
        for d in data:
            if type(d.sample) is list:
                if not admission_control(d.sample):
                    # a one-element list may still wrap a valid scalar sample
                    if len(d.sample) == 1:
                        if not admission_control(d.sample[0]):
                            corrupt_data.append(d)
                    else:
                        corrupt_data.append(d)
            elif not admission_control(d.sample):
                corrupt_data.append(d)
        return corrupt_data
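
# --- Hypothetical sketch (assumption, not part of the original file) ---
# `analyze_user` expects a module-level dict `phone_input_streams` mapping
# each phone stream name to an admission-control callable that returns True
# for valid samples. The real dict is defined elsewhere in this package; the
# stream names and validity ranges below are illustrative placeholders only.
phone_input_streams = {
    'BATTERY--org.md2k.phonesensor--PHONE':
        lambda x: 0 <= float(x) <= 100,   # battery level in percent
    'AMBIENT_LIGHT--org.md2k.phonesensor--PHONE':
        lambda x: float(x) >= 0,          # lux values are non-negative
}

# Example invocation (user id, day, and config path are placeholders):
#   analyzer = InputStreamsAnalyzer()
#   analyzer.analyze_all_users(['<user-uuid>'], ['20171221'],
#                              '/path/to/cc_configuration.yml')
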
class SqlToCCStream():
    def __init__(self, config):
        self.CC = CerebralCortex(config)
        self.config = self.CC.config
        self.sqlData = SqlData(self.config,
                               dbName="environmental_data_collection")
        self.process()

    def process(self):
        user_ids = self.filter_user_ids()
        # all weather-station locations (latitude/longitude pairs)
        all_locations = self.sqlData.get_latitude_llongitude()
        with open("weather_data.json", "r") as metadata_file:
            metadata = json.loads(metadata_file.read())
        input_stream_name = 'LOCATION--org.md2k.phonesensor--PHONE'

        for uid in user_ids:
            stream_ids = self.CC.get_stream_id(uid, input_stream_name)
            if len(stream_ids) == 0:
                continue
            print("Processing:", uid)
            for sid in stream_ids:
                sid = sid["identifier"]
                days = self.CC.get_stream_days(sid)
                for day in days:
                    print("User ID, Stream ID, Day", uid, sid, day)

                    # get GPS data from the phone location stream
                    location_stream = self.CC.get_stream(stream_id=sid, day=day)
                    if len(location_stream.data) == 0:
                        continue

                    # compute the median latitude and longitude for the day
                    user_loc = self.compute_lat_long_median(location_stream.data)
                    if user_loc == (0, 0):
                        continue
                    offset = location_stream.data[0].offset

                    # find the weather location matching the user's position
                    location_id = self.get_location_id(user_loc, all_locations)
                    if location_id is None:
                        continue
                    formated_day = datetime.strptime(
                        day, "%Y%m%d").strftime("%Y-%m-%d")
                    weather_data = self.sqlData.get_weather_data_by_city_id(
                        location_id, formated_day)

                    # convert the weather rows into a datastream
                    execution_context = metadata["execution_context"]
                    input_streams_metadata = [{"id": sid,
                                               "name": input_stream_name}]
                    metadata["execution_context"]["processing_module"]["input_streams"] = \
                        input_streams_metadata

                    dps = []
                    for wd in weather_data:
                        wd["temperature"] = json.loads(wd["temperature"])
                        wd["wind"] = json.loads(wd["wind"])
                        # daylight duration in hours
                        day_light_duration = \
                            ((wd["sunset"] - wd["sunrise"]).seconds) / 3600
                        dp_sample = [
                            wd["sunrise"],
                            wd["sunset"],
                            day_light_duration,
                            wd.get("wind", {}).get("deg", float('nan')),
                            wd.get("wind", {}).get("speed", float('nan')),
                            wd["temperature"]["temp"],
                            wd["temperature"]["temp_max"],
                            wd["temperature"]["temp_min"],
                            int(wd["humidity"]),
                            int(wd["clouds"]),
                            wd["other"]
                        ]
                        dps.append(DataPoint(wd["start_time"], None, offset,
                                             dp_sample))

                    if len(dps) > 0:
                        # derive a deterministic UUID for the output stream
                        output_stream_id = str(metadata["data_descriptor"]) + \
                            str(execution_context) + str(metadata["annotations"])
                        output_stream_id += "weather-data-stream"
                        output_stream_id += str(uid)
                        output_stream_id += str(sid)
                        output_stream_id = str(uuid.uuid3(uuid.NAMESPACE_DNS,
                                                          output_stream_id))
                        ds = DataStream(identifier=output_stream_id,
                                        owner=uid,
                                        name=metadata["name"],
                                        data_descriptor=metadata["data_descriptor"],
                                        execution_context=execution_context,
                                        annotations=metadata["annotations"],
                                        stream_type=metadata["type"],
                                        data=dps)
                        # store the data stream
                        self.CC.save_stream(ds)

    def compute_lat_long_median(self, data):
        """Return the median (latitude, longitude) of the GPS samples,
        or (0, 0) if no sample has the expected six fields."""
        latitude = []
        longitude = []
        valid_data = False
        for dp in data:
            if isinstance(dp.sample, list) and len(dp.sample) == 6:
                latitude.append(dp.sample[0])
                longitude.append(dp.sample[1])
                valid_data = True
        if valid_data:
            return statistics.median(latitude), statistics.median(longitude)
        else:
            return 0, 0

    def get_location_id(self, user_loc, all_locations):
        """Return the id of the weather location closest to `user_loc`,
        or None if no location lies within 30 miles."""
        closest = None
        location_id = None
        for loc in all_locations:
            distance = haversine(user_loc,
                                 (float(loc["latitude"]),
                                  float(loc["longitude"])),
                                 miles=True)
            if closest is None or distance < closest:
                closest = distance
                location_id = loc["id"]
        # select the candidate only if it is within 30 miles
        if closest is not None and closest <= 30:
            return location_id
        return None

    def filter_user_ids(self):
        """Return the ids of mperf users that have a data directory on disk."""
        active_users = []
        all_users = []
        for uid in self.CC.get_all_users("mperf"):
            all_users.append(uid["identifier"])

        data_dir = self.config["data_replay"]["data_dir"]
        for owner_dir in os.scandir(data_dir):
            if owner_dir.name in all_users:
                active_users.append(owner_dir.name)
        return active_users
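
# --- Hypothetical driver (assumption, not part of the original file) ---
# SqlToCCStream kicks off its whole pipeline from __init__ via process(), so
# running it only requires a CerebralCortex configuration path. The path
# below is a placeholder; weather_data.json must exist in the working
# directory, as process() reads the stream metadata from it.
if __name__ == '__main__':
    SqlToCCStream('/path/to/cc_configuration.yml')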