Beispiel #1
0
class InputStreamsAnalyzer():
    """
    Per-day data-quality analyzer for phone input streams.

    For every user, day and stream listed in ``phone_input_streams`` the
    analyzer counts the stored DataPoints, collects the ones rejected by the
    stream's admission-control callable and saves the result as a new
    ``<stream name>_corrupt_data`` stream.
    """

    def get_day_data(self, userid, day, stream_name, localtime=False):
        """
        Collect one day of DataPoints for every stream id registered under
        ``stream_name`` for the given user.

        :param userid: Owner of the streams
        :param day: Day to load, forwarded unchanged to ``self.CC.get_stream``
        :param stream_name: Name used to look up the stream ids
        :param bool localtime: Forwarded unchanged to ``self.CC.get_stream``
        :return: The concatenated DataPoints; sorted by ``start_time`` when
                 more than one stream id was found. Empty list if nothing
                 is stored.
        :rtype: List(DataPoint)
        """
        collected = []
        stream_ids = self.CC.get_stream_id(userid, stream_name)
        for stream_id in stream_ids:
            if stream_id is None:
                continue
            ds = self.CC.get_stream(stream_id['identifier'],
                                    user_id=userid,
                                    day=day,
                                    localtime=localtime)
            if ds is not None and ds.data is not None:
                collected.extend(ds.data)

        # Data merged from several streams is interleaved, so restore
        # chronological order; a single stream is returned as stored.
        if len(stream_ids) > 1:
            collected = sorted(collected, key=lambda dp: dp.start_time)

        return collected

    def analyze_all_users(self, userids, alldays, config_path):
        """
        Run :meth:`analyze_user` for every user id, printing progress.

        :param userids: Iterable of user ids
        :param alldays: Days to analyze, each formatted as ``%Y%m%d``
        :param config_path: Configuration handed to ``CerebralCortex``
        """
        for index, usr in enumerate(userids):
            print('Analyzing user %d %s' % (index, usr))
            self.analyze_user(usr, alldays, config_path)

    @staticmethod
    def _utc_offset_ms(timestamp):
        """
        Return the UTC offset of ``timestamp`` in milliseconds.

        ``datetime.utcoffset()`` returns ``None`` for naive datetimes; those
        are reported as offset 0 instead of raising ``AttributeError``.
        """
        offset = timestamp.utcoffset()
        if offset is None:
            return 0
        return offset.total_seconds() * 1000

    def analyze_user(self, userid, alldays, config_path):
        """
        Analyze every phone input stream of one user for each given day and
        save one quality DataPoint per (day, stream).

        The saved sample is ``[number_of_datapoints, corrupt_datapoints]``;
        days without data are saved as ``[0, []]``.

        :param str userid: Owner of the streams
        :param alldays: Days to analyze, each formatted as ``%Y%m%d``
        :param config_path: Configuration handed to ``CerebralCortex``
        """
        print(userid, alldays)
        self.CC = CerebralCortex(config_path)
        self.window_size = 3600
        # NOTE: the exact text of this template (whitespace included) is
        # hashed into the output stream identifier below. Do not reformat
        # it, or previously generated stream ids will no longer match.
        metadata = """
        {
          "annotations":[],
          "data_descriptor":[
            {
              "name":"total_datapoints",
              "type":"int",
              "description":"Total number of data points that are present in the input stream followed by an array of the corrupt datapoints",
              "stream_type": "sparse"
            }
          ],
          "execution_context":{
            "processing_module":{
              "name":"core.admission_control_marker.phone_stream_analyzer",
              "input_streams":[
                {
                  "name":"name",
                  "identifier" : "id"
                }
              ]
            },
            "algorithm":{
              "method":"core.admission_control_marker",
              "authors":[
                {
                  "name":"Anand",
                  "email":"*****@*****.**"
                }
              ],
              "version":"0.0.4",
              "description":"Analyzer for the phone input streams"
            }
          },
          "name":"NAME_dynamically_generated"
        }
        """

        date_format = '%Y%m%d'
        for day in alldays:
            # The analysis window is the whole day and is the same for every
            # stream, so compute it once per day.
            current_date = datetime.strptime(day, date_format)
            next_day = current_date + timedelta(days=1)

            for phone_stream in phone_input_streams:
                day_data = self.get_day_data(userid, day, phone_stream)

                if day_data:
                    corrupt_data = self.get_corrupt_data(
                        day_data, phone_input_streams[phone_stream])
                    utc_offset = self._utc_offset_ms(day_data[0].start_time)
                    sample = [len(day_data), corrupt_data]
                else:
                    utc_offset = 0
                    sample = [0, []]

                data_quality_analysis = [
                    DataPoint(start_time=current_date,
                              end_time=next_day,
                              offset=utc_offset,
                              sample=sample)
                ]

                # Parse a fresh copy per stream: the parsed template is
                # mutated below and handed to the DataStream.
                metadata_json = json.loads(metadata)
                metadata_name = phone_stream + '_corrupt_data'
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS,
                               metadata_name + userid + metadata))

                # Skip None entries, mirroring the check in get_day_data.
                input_streams = [
                    {'name': phone_stream,
                     'identifier': inpstrm['identifier']}
                    for inpstrm in self.CC.get_stream_id(userid, phone_stream)
                    if inpstrm is not None
                ]
                metadata_json["execution_context"]["processing_module"][
                    "input_streams"] = input_streams

                quality_ds = DataStream(
                    identifier=output_stream_id,
                    owner=userid,
                    name=metadata_name,
                    data_descriptor=metadata_json['data_descriptor'],
                    execution_context=metadata_json['execution_context'],
                    annotations=metadata_json['annotations'],
                    stream_type=1,
                    data=data_quality_analysis)
                # Best effort: a failed save is reported and the remaining
                # streams and days are still processed.
                try:
                    self.CC.save_stream(quality_ds)
                except Exception as e:
                    print(e)

    def get_corrupt_data(self, data, admission_control=None):
        """
        Return the DataPoints whose sample is rejected by admission control.

        A sample wrapped in a one-element list gets a second chance: it is
        only reported when both the list and its single element are rejected.

        :param List(DataPoint) data: Input data list
        :param Callable[[Any], bool] admission_control: Admission control function, which accepts the sample and
                returns a bool based on the data sample validity. When None, nothing is reported.
        :return: DataPoints that failed admission control, in input order
        :rtype: List(DataPoint)
        """
        if admission_control is None:
            return []

        corrupt_data = []
        for d in data:
            sample = d.sample
            if admission_control(sample):
                continue
            # The list itself was rejected; accept it anyway if it merely
            # wraps a single valid value.
            if (isinstance(sample, list) and len(sample) == 1
                    and admission_control(sample[0])):
                continue
            corrupt_data.append(d)

        return corrupt_data
class SqlToCCStream():
    """
    Build weather data streams from SQL weather records.

    For each active user and each day of the user's phone location stream,
    the median latitude/longitude is matched to the nearest known weather
    location and that location's weather records for the day are saved as a
    new DataStream.
    """

    def __init__(self, config):
        """
        Set up the storage handles and immediately run :meth:`process`.

        :param config: Configuration handed to ``CerebralCortex``
        """
        self.CC = CerebralCortex(config)
        self.config = self.CC.config
        self.sqlData = SqlData(self.config,
                               dbName="environmental_data_collection")
        self.process()

    def process(self):
        """
        Generate and store weather streams for every active user.

        Reads the stream metadata template from ``weather_data.json`` in the
        current working directory.
        """
        user_ids = self.filter_user_ids()
        # get all locations lats/longs
        all_locations = self.sqlData.get_latitude_llongitude()
        with open("weather_data.json", "r") as metadata_file:
            metadata = json.load(metadata_file)
        input_stream_name = 'LOCATION--org.md2k.phonesensor--PHONE'

        for uid in user_ids:
            stream_ids = self.CC.get_stream_id(uid, input_stream_name)
            if not stream_ids:
                continue
            print("Processing:", uid)
            for stream in stream_ids:
                sid = stream["identifier"]
                for day in self.CC.get_stream_days(sid):
                    print("User ID, Stream ID, Day", uid, sid, day)
                    self._process_day(uid, sid, day, input_stream_name,
                                      metadata, all_locations)

    def _process_day(self, uid, sid, day, input_stream_name, metadata,
                     all_locations):
        """Create and save the weather stream for one location-stream day."""
        # get gps data from stream-name 'LOCATION--org.md2k.phonesensor--PHONE'
        location_stream = self.CC.get_stream(stream_id=sid, day=day)
        if location_stream is None or not location_stream.data:
            return

        # compute median on lat. and long. vals; (0, 0) means "no usable fix"
        user_loc = self.compute_lat_long_median(location_stream.data)
        if user_loc == (0, 0):
            return
        offset = location_stream.data[0].offset

        # get weather data for the closest matching lat/long values
        location_id = self.get_location_id(user_loc, all_locations)
        if location_id is None:
            return

        formatted_day = datetime.strptime(day, "%Y%m%d").strftime("%Y-%m-%d")
        weather_data = self.sqlData.get_weather_data_by_city_id(
            location_id, formatted_day)

        # The shared metadata template is updated in place with the current
        # input stream. Key order matters: the dict's string form feeds the
        # output stream id.
        execution_context = metadata["execution_context"]
        execution_context["processing_module"]["input_streams"] = [{
            "id": sid,
            "name": input_stream_name
        }]

        dps = [
            DataPoint(row["start_time"], None, offset,
                      self._weather_sample(row))
            for row in weather_data
        ]
        if not dps:
            return

        ds = DataStream(
            identifier=self._weather_stream_id(metadata, uid, sid),
            owner=uid,
            name=metadata["name"],
            data_descriptor=metadata["data_descriptor"],
            execution_context=execution_context,
            annotations=metadata["annotations"],
            stream_type=metadata["type"],
            data=dps)

        # store data stream
        self.CC.save_stream(ds)

    @staticmethod
    def _weather_sample(row):
        """
        Convert one weather record into the sample list stored in a DataPoint.

        Layout: sunrise, sunset, daylight hours, wind degree, wind speed,
        current/max/min temperature, humidity, clouds, other. Missing wind
        values become NaN.
        """
        temperature = json.loads(row["temperature"])
        # A JSON null wind decodes to None; treat it like an empty record.
        wind = json.loads(row["wind"]) or {}
        day_light_duration = (
            row["sunset"] - row["sunrise"]).seconds / 3600  # hours
        return [
            row["sunrise"],
            row["sunset"],
            day_light_duration,
            wind.get("deg", float('nan')),
            wind.get("speed", float('nan')),
            temperature["temp"],
            temperature["temp_max"],
            temperature["temp_min"],
            int(row["humidity"]),
            int(row["clouds"]),
            row["other"],
        ]

    @staticmethod
    def _weather_stream_id(metadata, uid, sid):
        """Derive the deterministic UUID of the weather stream for (uid, sid)."""
        # NOTE: the "weather-data-stream" marker is intentionally repeated;
        # the seed must stay byte-identical so the ids of already stored
        # streams do not change. The day is deliberately not part of the
        # seed: all days of one input stream share one output stream id.
        seed = "".join([
            str(metadata["data_descriptor"]),
            str(metadata["execution_context"]),
            str(metadata["annotations"]),
            "weather-data-stream",
            "weather-data-stream",
            str(uid),
            str(sid),
        ])
        return str(uuid.uuid3(uuid.NAMESPACE_DNS, seed))

    def compute_lat_long_median(self, data):
        """
        Return the median latitude and longitude of the given DataPoints.

        Only samples that are lists of exactly six values are used; element
        0 is taken as the latitude and element 1 as the longitude.

        :param List(DataPoint) data: Location DataPoints
        :return: ``(median_latitude, median_longitude)``, or ``(0, 0)`` when
                 no sample qualifies
        """
        latitude = []
        longitude = []
        for dp in data:
            if isinstance(dp.sample, list) and len(dp.sample) == 6:
                latitude.append(dp.sample[0])
                longitude.append(dp.sample[1])
        if not latitude:
            return 0, 0
        return statistics.median(latitude), statistics.median(longitude)

    def get_location_id(self, user_loc, all_locations, max_distance_miles=30):
        """
        Return the id of the weather location closest to ``user_loc``.

        :param user_loc: ``(latitude, longitude)`` of the user
        :param all_locations: Records with ``id``, ``latitude`` and
                ``longitude`` keys
        :param max_distance_miles: Largest accepted distance to the closest
                location
        :return: The closest location's id, or None when ``all_locations`` is
                 empty or the closest location is farther away than
                 ``max_distance_miles``
        """
        closest = None
        location_id = None
        for loc in all_locations:
            distance = haversine(
                user_loc, (float(loc["latitude"]), float(loc["longitude"])),
                miles=True)
            if closest is None or distance < closest:
                closest = distance
                location_id = loc["id"]

        # closest stays None when there is nothing to compare against.
        if closest is not None and closest <= max_distance_miles:
            return location_id
        return None

    def filter_user_ids(self):
        """
        Return the names of the entries in the data-replay directory that
        match the identifier of an "mperf" user.

        :return: Matching entry names, in directory-scan order
        :rtype: list
        """
        known_users = {
            user["identifier"] for user in self.CC.get_all_users("mperf")
        }
        data_dir = self.config["data_replay"]["data_dir"]
        # The context manager closes the directory iterator deterministically.
        with os.scandir(data_dir) as entries:
            return [
                entry.name for entry in entries if entry.name in known_users
            ]