Example #1
    def window_example(self):
        """
        This example windows the phone battery stream into 1-minute chunks and takes the average battery level of each window.

        """

        # get sample stream data
        ds = self.CC.get_stream(self.stream_name)

        new_ds = ds.window(windowDuration=60)
        new_ds.show(5)

        # save newly created data as a new stream in cerebralcortex
        new_stream_name = "BATTERY--org.md2k.phonesensor--PHONE-windowed-data"

        new_ds.metadata.set_name(new_stream_name).set_description("1 minute windowed data of phone battery with average battery levels of each window.") \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "start/end time of a window")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "average battery values of a window")) \
            .add_module(
            ModuleMetadata().set_name("cerebralcortex.examples.main").set_version("0.1.2").set_attribute("description", "CerebralCortex-kernel example code to window phone battery data").set_author(
                "test_user", "test_user@test_email.com"))

        if self.CC.save_stream(new_ds):
            print(new_stream_name, "has been stored.\n\n")
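A minimal driver for the example above might look like the following sketch; the config path is a placeholder, the stream name comes from the example itself, and the Kernel constructor is used as in process_save_stream() later in this section.

# Hypothetical driver for the windowing example above; the config path is a placeholder.
from cerebralcortex.kernel import Kernel

CC = Kernel("/path/to/cc/configs/")
ds = CC.get_stream("BATTERY--org.md2k.phonesensor--PHONE")
windowed_ds = ds.window(windowDuration=60)  # 60-second windows
windowed_ds.show(5)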
Example #2
def upload_stream_data(base_url: str, username: str, password: str,
                       stream_name: str, data_file_path: str):
    """
    Upload stream data to cerebralcortex storage using CC-ApiServer

    Args:
        base_url (str): base URL of the CerebralCortex-APIServer. For example, http://localhost/
        username (str): username
        password (str): password of the user
        stream_name (str): name of the stream to register and upload data to
        data_file_path (str): path of the stream data file that needs to be uploaded

    Raises:
        Exception: if stream data upload fails

    """

    login_url = base_url + "api/v3/user/login"
    register_stream_url = base_url + "api/v3/stream/register"
    user_metadata = {
        "username": username,
        "password": password,
        "user_role": "demo-user",
        "user_metadata": {
            "key": "demo-md",
            "value": "demo-vmd"
        },
        "user_settings": {
            "key": "string",
            "value": "string"
        }
    }
    metadata = Metadata().set_name(stream_name).set_description("mobile phone accelerometer sensor data.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_x").set_type("float").set_attribute("description", "acceleration minus gx on the x-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_y").set_type("float").set_attribute("description", "acceleration minus gy on the y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_z").set_type("float").set_attribute("description", "acceleration minus gz on the z-axis")) \
        .add_module(
        ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("2.0.7").set_attribute("description", "data is collected using mcerebrum.").set_author(
            "test_user", "test_user@test_email.com"))

    stream_metadata = metadata.to_json()
    user_registration_url = base_url + "api/v3/user/register"
    client.register_user(url=user_registration_url,
                         user_metadata=user_metadata)

    auth = client.login_user(login_url, username, password)

    status = client.register_stream(register_stream_url,
                                    auth.get("auth_token"), stream_metadata)

    stream_upload_url = base_url + "api/v3/stream/" + status.get("hash_id")
    result = client.upload_stream_data(stream_upload_url,
                                       auth.get("auth_token"), data_file_path)

    print(result)
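A hedged invocation sketch for upload_stream_data(); every value below (URL, credentials, stream name, file path) is a placeholder, not part of the original example.

upload_stream_data(
    base_url="http://localhost/",
    username="demo-user",
    password="demo-password",
    stream_name="accelerometer--org.md2k.phonesensor--PHONE",
    data_file_path="/path/to/accelerometer_data.gz")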
Example #3
def process_save_stream(msg: dict, cc_config_path: str):
    """
    Process one Kafka message, add Gaussian noise to its data, and store the result as a new stream

    Args:
        msg (dict): kafka message - {'filename': str, 'metadata_hash': str, "stream_name": str, "user_id": str}
        cc_config_path (str): path of cerebralcortex configs

    Notes:
        This method creates the CC object again because this code runs on a worker node, which has no access to the CC object created in run().
        The CC object cannot be passed to worker nodes because it contains sockets, and sockets cannot be serialized by Spark to pass as a parameter.

    """

    # Disable pandas warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    CC = Kernel(cc_config_path, enable_spark=False)
    cc_config = CC.config
    stream_name = msg.get("stream_name")
    user_id = msg.get("user_id")

    if cc_config["nosql_storage"] == "filesystem":
        file_name = str(
            cc_config["filesystem"]["filesystem_path"]) + msg.get("filename")
    elif cc_config["nosql_storage"] == "hdfs":
        file_name = str(
            cc_config["hdfs"]["raw_files_dir"]) + msg.get("filename")
    else:
        raise Exception(
            str(cc_config["nosql_storage"]) +
            " is not supported. Please use filesystem or hdfs.")

    if os.path.exists(file_name):
        data = pq.read_table(file_name)
        pdf = data.to_pandas()

        pdf = add_gaussian_noise(pdf)

        new_stream_name = stream_name + "_gaussian_noise"

        metadata = Metadata().set_name(new_stream_name).set_description("Gaussian noise added to the accel sensor stream.") \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel x")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel y")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel z")) \
            .add_module(
            ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("0.0.1").set_attribute("description", "Spark streaming example using CerebralCortex. This example adds gaussian noise to a stream data.").set_author(
                "test_user", "test_user@test_email.com"))

        pdf["user"] = user_id
        ds = DataStream(data=pdf, metadata=metadata)
        CC.save_stream(ds)
    else:
        print(file_name, "does not exist.")
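process_save_stream() depends on an add_gaussian_noise() helper that is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming the numeric sensor columns are named x, y, and z (an assumption, not confirmed by the original):

import numpy as np
import pandas as pd

def add_gaussian_noise(pdf: pd.DataFrame, columns=("x", "y", "z"), sigma=0.01) -> pd.DataFrame:
    # Hypothetical helper: adds zero-mean Gaussian noise to the selected numeric columns.
    noisy = pdf.copy()
    for col in columns:
        if col in noisy.columns:
            noisy[col] = noisy[col] + np.random.normal(0.0, sigma, size=len(noisy))
    return noisy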
Example #4
def get_metadata():
    stream_name = 'fill in your stream name'
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("Sequence Alignment, Timestamp Correction and Decoding of MotionsenseHRV") \
        .add_dataDescriptor(
        DataDescriptor().set_name("red").set_type("float").set_attribute("description", "Value of Red LED - PPG")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("infrared").set_type("float").set_attribute("description", "Value of Infrared LED - PPG")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("green").set_type("float").set_attribute("description", "Value of Green LED - PPG")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("aclx").set_type("float").set_attribute("description", "Wrist Accelerometer X-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("acly").set_type("float").set_attribute("description", "Wrist Accelerometer Y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("aclz").set_type("float").set_attribute("description", "Wrist Accelerometer Z-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("gyrox").set_type("float").set_attribute("description", "Wrist Gyroscope X-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("gyroy").set_type("float").set_attribute("description", "Wrist Gyroscope Y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("gyroz").set_type("float").set_attribute("description", "Wrist Gyroscope Z-axis")) \
        .add_module(
        ModuleMetadata().set_name("cerebralcortex.algorithms.raw_byte_decode.motionsenseHRV.py").set_attribute("url", "https://md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #5
def generate_metadata_hourly():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--visualization-stats--time-window').set_description('Computes visualization stats every time window defined by start time and end time') \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("timestamp").set_attribute("description", "Start time of the time window in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("timestamp").set_attribute("description", "End time of the time window in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("double").set_attribute("description", "Latitude of centroid location, a gps cluster output grouping encounters in similar location together")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("double").set_attribute("description", "Longitude of centroid location, a gps cluster output grouping encounters in similar location together")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("n_users").set_type("integer").set_attribute("description", "Number of unique users in that cluster centroid")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("total_encounters").set_type("double").set_attribute("description", "Total encounters happening in the time window in this specific location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("normalized_total_encounters").set_type("double").set_attribute("description", "Total encounters normalized by the centroid area (encounters per 10 square meters)")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("avg_encounters").set_type("double").set_attribute("description", "Average encounters per participant (participants who had at least one encounter)"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Visualization stats computation in a time window between start time and end time') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #6
def get_metadata():
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("stress likelihood computed from ECG") \
        .add_input_stream(stress_features_normalized.metadata.get_name()) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stress_probability")
            .set_type("double").set_attribute("description", "stress likelihood computed from ECG only model")
            .set_attribute("threshold", "0.47")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("window")
            .set_type("struct")
            .set_attribute("description", "window start and end time in UTC")
            .set_attribute('start', 'start of 1 minute window')
            .set_attribute('end', 'end of 1 minute window')) \
        .add_module(
        ModuleMetadata().set_name("ECG Stress Model")
            .set_attribute("url", "http://md2k.org/")
            .set_attribute('algorithm', 'cStress')
            .set_attribute('unit', 'ms').set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #7
def gen_phone_battery_metadata() -> Metadata:
    """
    Create Metadata object with some sample metadata of phone battery data

    Returns:
        Metadata: metadata of phone battery stream
    """
    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_description("this is a test-stream.").set_name("BATTERY--org.md2k.phonesensor--PHONE") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("battery_level").set_type("float").set_attribute("description", "current battery charge")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "stream version")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_module(
        ModuleMetadata().set_name("battery").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "test_user", "test_user@test_email.com"))
    stream_metadata.is_valid()
    return stream_metadata
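The Metadata object built above can be validated and serialized; to_json() is used the same way for stream registration in upload_stream_data() earlier in this section. The sketch below assumes the generator function is in scope.

# Validate and serialize the sample metadata.
metadata = gen_phone_battery_metadata()
metadata.is_valid()                 # sanity-check required fields, as the generators here do
metadata_json = metadata.to_json()  # JSON payload suitable for stream registration
print(metadata_json)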
Example #8
def gen_phone_battery_metadata(stream_name) -> Metadata:
    """
    Create Metadata object with some sample metadata of phone battery data
    Returns:
        Metadata: metadata of phone battery stream
    """
    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("mobile phone battery sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("level").set_attribute("description", "current battery charge")) \
        .add_module(
        ModuleMetadata().set_name("battery").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()
    return stream_metadata
Example #9
def gen_battery_data(CC, study_name, user_id, stream_name, version=1, hours=1):
    """
    Create pyspark dataframe with some sample phone battery data
    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "version", "user"]

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "level", "voltage",
        "temperature"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sample = 100
    voltage = 3700
    temperature = 70
    sqlContext = get_or_create_sc("sqlContext")
    total_data = hours * 60 * 60
    for row in range(total_data, 1, -1):
        sample = float(sample - 0.01)
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, sample,
                            voltage, temperature))
    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("battery sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("level").set_type("float").set_attribute("description", "current battery charge")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("voltage").set_type("float").set_attribute("description", "current battery voltage level")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("temperature").set_type("float").set_attribute("description", "current battery temperature")) \
        .add_module(
        ModuleMetadata().set_name("battery").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
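A sketch of how this generator might be driven end to end; the config path is a placeholder, the user id is reused from the sample data later in this section, and the Kernel constructor is used as in process_save_stream() above.

# Hypothetical driver: generate an hour of sample battery data, then read it back.
CC = Kernel("/path/to/cc/configs/")
gen_battery_data(CC, study_name="default",
                 user_id="00000000-afb8-476e-9872-6472b4e66b68",
                 stream_name="BATTERY--org.md2k.phonesensor--PHONE",
                 version=1, hours=1)
ds = CC.get_stream("BATTERY--org.md2k.phonesensor--PHONE")
ds.show(5)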
Example #10
    def _gen_metadata(self):
        from pyspark.sql.group import GroupedData
        if isinstance(self._data, GroupedData):
            return Metadata()
        else:
            schema = self._data.schema
            stream_metadata = Metadata()
            for field in schema.fields:
                stream_metadata.add_dataDescriptor(DataDescriptor().set_name(
                    str(field.name)).set_type(str(field.dataType)))

            stream_metadata.add_module(ModuleMetadata().set_name(
                "cerebralcortex.core.datatypes.datastream.DataStream"
            ).set_attribute("url", "https://md2k.org").set_author(
                "Nasir Ali", "*****@*****.**"))

            return stream_metadata
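The same schema-driven inference can be exercised on any Spark DataFrame: one DataDescriptor per column, typed from the Spark schema. The SparkSession setup below is illustrative, not part of the original, and Metadata/DataDescriptor are assumed imported as elsewhere in this section.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, "a")], ["value", "label"])

inferred = Metadata()
for field in df.schema.fields:
    inferred.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType)))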
Example #11
    def get_metadata():
        """
        generate metadata for the stream

        Returns:
            MetaData object
        """
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("ECG RR interval in milliseconds") \
            .add_input_stream(ecg_data.metadata.get_name()) \
            .add_dataDescriptor(
            DataDescriptor().set_name("rr").set_type("float") \
                .set_attribute("description","rr interval")) \
            .add_module(
            ModuleMetadata().set_name("ecg rr interval") \
                .set_attribute("url","http://md2k.org/") \
                .set_attribute('algorithm','pan-tomkins').set_attribute('unit','ms').set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata
Example #12
def gen_accel_gyro_data(CC, study_name, user_id, stream_name, version=1, hours=1, frequency=32):
    """
    Create a pyspark dataframe with some sample accelerometer/gyroscope data and save it as a stream.
    The dataframe columns are: ["timestamp", "localtime", "user", "version", "x", "y", "z"]

    """
    column_name = ["timestamp", "localtime", "user" ,"version", "x", "y", "z"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)

    sqlContext = get_or_create_sc("sqlContext")
    total_samples = (hours * 60 * 60) * frequency
    for row in range(total_samples):
        x = round(random.uniform(-2, 2), 8)
        y = round(random.uniform(-2, 2), 8)
        z = round(random.uniform(-2, 2), 8)
        # space samples at the sampling frequency (1000/frequency ms apart)
        timestamp = timestamp + timedelta(milliseconds=1000.0 / frequency)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, x, y, z))
    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("wrist watch sensor sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("x").set_type("float").set_attribute("description", "x-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("y").set_type("float").set_attribute("description", "y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("z").set_type("float").set_attribute("description", "z-axis")) \
        .add_module(
        ModuleMetadata().set_name("phone.sensors").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
Example #13
def get_metadata(stress_imputed_data, output_stream_name, input_stream_name):
    """
    generate metadata for a datastream.

    Args:
        stress_imputed_data (DataStream): stream whose schema is used to build the data descriptors
        output_stream_name (str): name for the new stream
        input_stream_name (str): name of the input stream to record in the metadata

    Returns:
        Metadata: metadata object for the imputed stress stream
    """
    schema = stress_imputed_data.schema
    stream_metadata = Metadata()
    stream_metadata.set_name(output_stream_name).set_description("stress imputed")\
        .add_input_stream(input_stream_name)
    for field in schema.fields:
        stream_metadata.add_dataDescriptor(DataDescriptor().set_name(
            str(field.name)).set_type(str(field.dataType)))
    stream_metadata.add_module(
        ModuleMetadata().set_name("stress forward fill imputer") \
            .set_attribute("url", "hhtps://md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #14
def get_metadata():
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("Stress episodes computed using MACD formula.") \
        .add_input_stream(ecg_stress_probability.metadata.get_name()) \
        .add_dataDescriptor(DataDescriptor().set_name("timestamp").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("localtime").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("version").set_type("int")) \
        .add_dataDescriptor(DataDescriptor().set_name("user").set_type("string")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stress_probability").set_type("float")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stress_episode").set_type("string").set_attribute("description", "stress episodes calculated using MACD")) \
        .add_module(
        ModuleMetadata().set_name("cerebralcortex.algorithm.stress_prediction.stress_episodes.compute_stress_episodes")
            .set_attribute("url", "http://md2k.org/").set_author(
            "Anandatirtha Nandugudi", "*****@*****.**"))
    return stream_metadata
Example #15
def get_metadata():
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("Chest ECG quality 3 seconds") \
        .add_input_stream(ecg.metadata.get_name()) \
        .add_dataDescriptor(DataDescriptor().set_name("timestamp").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("localtime").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("version").set_type("int")) \
        .add_dataDescriptor(DataDescriptor().set_name("user").set_type("string")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("quality").set_type("string")
            .set_attribute("description", "ECG data quality")
            .set_attribute('Loose/Improper Attachment', 'Electrode Displacement')
            .set_attribute('Sensor off Body', 'Autosense not worn')
            .set_attribute('Battery down/Disconnected', 'No data is present - Can be due to battery down or sensor disconnection')
            .set_attribute('Intermittent Data Loss', 'Not enough samples are present')
            .set_attribute('Acceptable', 'Good Quality')) \
        .add_dataDescriptor(
        DataDescriptor().set_name("ecg").set_type("double").set_attribute("description", "ecg sample value")) \
        .add_module(
        ModuleMetadata().set_name("ecg data quality").set_attribute("url", "http://md2k.org/").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #16
def normalize_features(data,
                       index_of_first_order_feature=2,
                       lower_percentile=20,
                       higher_percentile=99,
                       minimum_minutes_in_day=60,
                       no_features=11,
                       epsilon=1e-8,
                       input_feature_array_name='features'):
    """

    Args:
        data:
        index_of_first_order_feature:
        lower_percentile:
        higher_percentile:
        minimum_minutes_in_day:
        no_features:
        epsilon:
        input_feature_array_name:

    Returns:

    """
    data_day = data.withColumn('day', F.date_format('localtime', 'yyyyMMdd'))
    stream_metadata = data.metadata
    stream_metadata \
        .add_input_stream(data.metadata.get_name()) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("features_normalized")
            .set_type("array")
            .set_attribute("description","All features normalized daywise"))
    data_day = data_day.withColumn('features_normalized',
                                   F.col(input_feature_array_name))
    if 'window' in data.columns:
        data_day = data_day.withColumn(
            'start',
            F.col('window').start).withColumn(
                'end',
                F.col('window').end).drop(*['window'])
    schema = data_day._data.schema

    def weighted_avg_and_std(values, weights):
        """
        Return the weighted average and standard deviation.

        values, weights -- Numpy ndarrays with the same shape.
        """
        average = np.average(values, weights=weights)
        # Fast and numerically precise:
        variance = np.average((values - average)**2, weights=weights)
        return average, math.sqrt(variance)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('org.md2k.autosense.ecg.features', 'normalize_features',
                 "org.md2k.autosense.ecg.normalized.features",
                 ['user', 'timestamp'], ['user', 'timestamp'])
    def normalize_features(data):
        """


        Args:
            data:

        Returns:

        """
        if len(data) < minimum_minutes_in_day:
            return pd.DataFrame([], columns=data.columns)
        quals1 = np.array([1] * data.shape[0])
        feature_matrix = np.array(list(
            data[input_feature_array_name])).reshape(-1, no_features)
        ss = np.repeat(feature_matrix[:, index_of_first_order_feature],
                       np.int64(np.round(100 * quals1)))
        rr_70th = np.percentile(ss, lower_percentile)
        rr_95th = np.percentile(ss, higher_percentile)
        index = np.where(
            (feature_matrix[:, index_of_first_order_feature] > rr_70th)
            & (feature_matrix[:, index_of_first_order_feature] < rr_95th))[0]
        for i in range(feature_matrix.shape[1]):
            m, s = weighted_avg_and_std(feature_matrix[index, i],
                                        quals1[index])
            s += epsilon
            feature_matrix[:, i] = (feature_matrix[:, i] - m) / s
        data['features_normalized'] = list(
            [np.array(b) for b in feature_matrix])
        return data

    data_normalized = data_day._data.groupby(['user', 'day', 'version'
                                              ]).apply(normalize_features)
    if 'window' in data.columns:
        data_normalized = data_normalized.withColumn(
            'window', F.struct('start', 'end')).drop(*['start', 'end', 'day'])
    else:
        data_normalized = data_normalized.drop(*['day'])
    features = DataStream(data=data_normalized, metadata=stream_metadata)
    return features
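At its core, normalize_features() applies a per-day z-score: within each day it computes a mean and standard deviation over rows whose first-order feature lies between the two percentiles, then standardizes every feature with those statistics. A dependency-free pandas/numpy sketch of that idea, with illustrative column names:

import numpy as np
import pandas as pd

def zscore_by_day(df: pd.DataFrame, feature_col="feature", day_col="day", eps=1e-8):
    # Illustrative daywise standardization, simplified from normalize_features() above.
    def _norm(group):
        lo, hi = np.percentile(group[feature_col], [20, 99])
        inliers = group.loc[(group[feature_col] > lo) & (group[feature_col] < hi), feature_col]
        m, s = inliers.mean(), inliers.std() + eps
        group[feature_col + "_normalized"] = (group[feature_col] - m) / s
        return group
    return df.groupby(day_col, group_keys=False).apply(_norm)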
Example #17
def gen_stress_data(stream_name, spark_df=False):
    data = [
        [0.7, "road", "Driving", "Was Tailgated", "IN_VEHICLE"],
        [0.3, "work", "Job", "Bored / Not enough to do", "STILL"],
        [0.5, "home", "Health", "Physical inability", "STILL"],
        [0.6, "road", "Driving", "Saw a police car", "IN_VEHICLE"],
        [0.38, "work", "Job", "Technology barriers", "STILL"],
        [0.2, "home", "Finance", "Missed payment", "UNKNOWN"],
        [0.9, "work", "Finance", "Unexpected losses", "WALKING"],
        [0.54, "road", "Driving", "Difficulty in navigating", "IN_VEHICLE"],
        [0.79, "work", "Job", "Unpleasant conversation", "ON_FOOT"],
        [0.28, "road", "Health", "My eating habits", "IN_VEHICLE"],
        [
            0.47, "road", "Driving", "Indecision at a traffic intersection",
            "IN_VEHICLE"
        ],
        [0.67, "work", "Job", "Late arrival", "WALKING"],
    ]

    column_name = [
        'user', 'timestamp', 'localtime', 'version', 'start_time', 'end_time',
        'density', 'location', 'stresser_main', 'stresser_sub', 'activity'
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)

    for row in range(20, 1, -1):
        if row > 10:
            user_id = "00000000-afb8-476e-9872-6472b4e66b68"
        else:
            user_id = "b1117354-ce48-4325-b2e3-78b0cc932819"
        timestamp = timestamp + timedelta(
            hours=random.choice([1, 3, 7, 2, 4, 5]))
        localtime = timestamp - timedelta(hours=5)
        start_time = timestamp
        end_time = timestamp + timedelta(
            minutes=random.choice([12, 6, 8, 16, 29, 45, 2, 3, 8]))
        data_vals = random.choice(data)
        sample_data.append([
            user_id, timestamp, localtime, 1, start_time, end_time,
            data_vals[0], data_vals[1], data_vals[2], data_vals[3],
            data_vals[4]
        ])

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("datetime").set_attribute("description", "start time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("datetime").set_attribute("description", "end time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("density").set_type("float").set_attribute("description", "density of stress")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("location").set_type("string").set_attribute("description", "location where stress episode was captured.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_main").set_type("string").set_attribute("description", "stressers' main category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_sub").set_type("string").set_attribute("description", "stressers' sub category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("activity").set_type("string").set_attribute("description", "physical activity name")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_stress_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    if spark_df:
        sqlContext = get_or_create_sc("sqlContext")
        df = sqlContext.createDataFrame(sample_data, column_name)
    else:
        df = pd.DataFrame(sample_data, columns=column_name)

    ds = DataStream(df, stream_metadata)
    return ds
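Because gen_stress_data() returns either a pandas-backed or a Spark-backed DataStream, a quick smoke test might look like this (the stream name is a placeholder):

ds = gen_stress_data("stress-sample--data", spark_df=False)       # pandas DataFrame inside
print(ds.metadata.get_name())
spark_ds = gen_stress_data("stress-sample--data", spark_df=True)  # pyspark DataFrame inside
spark_ds.show(5)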
Example #18
def generate_metadata_encounter():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k-encounter--bluetooth-gps').set_description('Contains each unique encounter between two persons along with the location of the encounter') \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("timestamp").set_attribute("description", "Start time of the encounter in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("timestamp").set_attribute("description", "End time of the encounter in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("participant_identifier").set_type("string").set_attribute("description", "Participant with whom encounter happened")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("os").set_type("string").set_attribute("description", "Operating system of the phone belonging to user")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("double").set_attribute("description", "Latitude of encounter location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("double").set_attribute("description", "Longitude of encounter location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("distances").set_type("array").set_attribute("description", "Mean distance between participants in encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("distance_mean").set_type("array").set_attribute("description", "Mean distance in the encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("distance_std").set_type("array").set_attribute("description", "Standard deviation of distances in encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("distance_count").set_type("array").set_attribute("description", "Number of distances in encounter less than distance threshold")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("average_count").set_type("double").set_attribute("description", "Average count of values received in phone per minute - average across the encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("covid").set_type("integer").set_attribute("description", "0, 1 or 2 indicating whether this encounter involved a covid user -- 0: no covid-19 affected, 1: the user is, 2: the participant identified is"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Encounter computation after parsing raw bluetooth-gps data, clustering gps locations and removing double counting') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #19
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 9, 1, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")

    lower_left = [35.079678, -90.074136]
    upper_right = [35.194771, -89.868766]
    alt = [i for i in range(83, 100)]

    for location in range(5):
        lat = random.uniform(lower_left[0], upper_right[0])
        long = random.uniform(lower_left[1], upper_right[1])
        for dp in range(150):
            lat_val = random.gauss(lat, 0.001)
            long_val = random.gauss(long, 0.001)
            alt_val = random.choice(alt)

            speed_val = round(random.uniform(0.0, 5.0), 6)
            bearing_val = round(random.uniform(0.0, 350), 6)
            accuracy_val = round(random.uniform(10.0, 30.4), 6)

            timestamp = timestamp + timedelta(minutes=1)
            localtime = timestamp + timedelta(hours=5)
            sample_data.append(
                (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
                 speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(data=df, metadata=stream_metadata)
    return ds
Example #20
def get_metadata():
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \
        .add_input_stream(rr_data.metadata.get_name()) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("var")
            .set_type("double")
            .set_attribute("description", "variance")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("iqr")
            .set_type("double")
            .set_attribute("description", "Inter Quartile Range")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("mean")
            .set_type("double")
            .set_attribute("description", "Mean RR Interval")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("median")
            .set_type("double")
            .set_attribute("description", "Median RR Interval")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("80th")
            .set_type("double")
            .set_attribute("description", "80th percentile RR Interval")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("20th")
            .set_type("double")
            .set_attribute("description", "20th percentile RR Interval")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("heartrate")
            .set_type("double")
            .set_attribute("description", "Heart Rate in BPM")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("vlf")
            .set_type("double")
            .set_attribute("description", "Very Low Frequency Energy")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("lf")
            .set_type("double")
            .set_attribute("description", "Low Frequency Energy")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("hf")
            .set_type("double")
            .set_attribute("description", "High Frequency Energy")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("lfhf")
            .set_type("double")
            .set_attribute("description", "Low frequency to High Frequency energy ratio")) \
        .add_dataDescriptor(
        DataDescriptor()
            .set_name("window")
            .set_type("struct")
            .set_attribute("description", "window start and end time in UTC")
            .set_attribute('start', 'start of window')
            .set_attribute('end', 'end of window')) \
        .add_module(
        ModuleMetadata().set_name("HRV Features from ECG RR Interval")
            .set_attribute("url", "http://md2k.org/")
            .set_attribute('algorithm', 'ecg feature computation')
            .set_attribute('unit', 'ms')
            .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #21
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")
    lat = [
        35.1247391, 35.1257391, 35.1217391, 35.1117391, 35.1317391, 35.1287391,
        35.5217391
    ]
    long = [
        -89.9750021, -89.9710021, -89.9800021, -89.9670021, -89.9790021,
        -89.9710021, -89.8700021
    ]
    alt = [83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0]
    for dp in range(500):
        lat_val = random.choice(lat)
        long_val = random.choice(long)
        alt_val = random.choice(alt)
        speed_val = round(random.uniform(0.0, 5.0), 6)
        bearing_val = round(random.uniform(0.0, 350), 6)
        accuracy_val = round(random.uniform(10.0, 30.4), 6)
        timestamp = timestamp + timedelta(minutes=1)
        localtime = timestamp + timedelta(hours=5)
        sample_data.append(
            (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
             speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_version(1).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_version("0.0.1").set_attribute("attribute_key", "attribute_value").set_author(
            "test_user", "test_user@test_email.com"))
    stream_metadata.is_valid()

    return DataStream(data=df, metadata=stream_metadata)
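As with the other generators in this section, the returned DataStream can be inspected or saved; the user id and stream name below are placeholders.

# Hypothetical usage of the location generator above.
location_ds = gen_location_datastream(
    user_id="00000000-afb8-476e-9872-6472b4e66b68",
    stream_name="GPS--org.md2k.phonesensor--PHONE")
location_ds.show(5)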